Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00
[5356427] fix: Remove the seq_len of 4096 from FP8 block scale MoE tuning configs. (#5485)
A seq_len of 4096 triggers an as-yet-undiagnosed CUDA illegal memory access when this test runs consecutively with certain other tests. Instead of tuning that bucket, apply a saturated upper bound: any sequence length above 2048 is rounded down to the 2048 bucket.
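For illustration, here is a minimal self-contained sketch of the saturating round rule this commit introduces. The helper below reimplements the behavior of last_positive_power_of_2 (the real one lives in tensorrt_llm._torch.utils), and the cap of 2048 matches the new bound in the diff below:

# Sketch of the saturating bucket scheme (self-contained reimplementation;
# the real helpers live in tensorrt_llm._torch.utils).

def last_positive_power_of_2(x: int) -> int:
    """Largest power of two that is <= x (and >= 1)."""
    p = 1
    while p * 2 <= x:
        p *= 2
    return p

MAX_BUCKET = 2048  # saturated upper bound introduced by this commit

def round_rule(x: int) -> int:
    # Round the observed num_tokens down to a power of two, then clamp,
    # so seq_len 4096 (and anything larger) reuses the 2048 tactic.
    return min(last_positive_power_of_2(x), MAX_BUCKET)

assert round_rule(4096) == 2048  # previously its own bucket; now clamped
assert round_rule(3000) == 2048
assert round_rule(100) == 64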
parent 74ae15a26b
commit 3fc57543e2
@@ -4,7 +4,8 @@ from typing import List, Tuple

 import torch

-from tensorrt_llm._torch.utils import last_positive_power_of_2
+from tensorrt_llm._torch.utils import (get_last_power_of_2_num_tokens_buckets,
+                                       last_positive_power_of_2)

 from ..autotuner import (AutoTuner, ConstraintSpec, DynamicTensorSpec,
                          OptimizationProfile, TunableRunner, TuningConfig)
@@ -123,8 +124,8 @@ class FP8BlockScaleMoERunner(TunableRunner):
     HIDDEN_STATES_IDX = 2
     TUNED_DIM = 0

-    m_values = (1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096)
-    round_rule = lambda x: last_positive_power_of_2(x)
+    m_values = get_last_power_of_2_num_tokens_buckets(2048)
+    round_rule = lambda x: min(last_positive_power_of_2(x), 2048)

     specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values,
                                round_rule), )
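For reference, a hedged sketch of what get_last_power_of_2_num_tokens_buckets(2048) plausibly yields. The exact return type and ordering of the real helper are an assumption here, not confirmed by the diff; the point is only that the new config covers the same power-of-two grid as the old hand-written tuple, minus the problematic 4096 entry:

def get_last_power_of_2_num_tokens_buckets(max_num_tokens: int):
    # Assumed behavior: every power of two from max_num_tokens down to 1.
    buckets = []
    m = max_num_tokens
    while m >= 1:
        buckets.append(m)
        m //= 2
    return tuple(buckets)

# Under that assumption:
# get_last_power_of_2_num_tokens_buckets(2048)
# -> (2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1)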