[5356427] fix: Remove the seq_len of 4096 from FP8 block scale MoE tuning configs. (#5485)

A seq_len of 4096 causes an as-yet-undiagnosed CUDA illegal memory access when run consecutively with certain other tests.
Instead of tuning for 4096, saturate any sequence length larger than 2048 to that upper bound.
Author: Yukun He 2025-06-26 08:38:35 +08:00, committed by GitHub
Parent: 74ae15a26b
Commit: 3fc57543e2


@@ -4,7 +4,8 @@ from typing import List, Tuple
 import torch
-from tensorrt_llm._torch.utils import last_positive_power_of_2
+from tensorrt_llm._torch.utils import (get_last_power_of_2_num_tokens_buckets,
+                                       last_positive_power_of_2)
 from ..autotuner import (AutoTuner, ConstraintSpec, DynamicTensorSpec,
                          OptimizationProfile, TunableRunner, TuningConfig)
@@ -123,8 +124,8 @@ class FP8BlockScaleMoERunner(TunableRunner):
     HIDDEN_STATES_IDX = 2
     TUNED_DIM = 0
-    m_values = (1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096)
-    round_rule = lambda x: last_positive_power_of_2(x)
+    m_values = get_last_power_of_2_num_tokens_buckets(2048)
+    round_rule = lambda x: min(last_positive_power_of_2(x), 2048)
     specs = (DynamicTensorSpec(HIDDEN_STATES_IDX, TUNED_DIM, m_values,
                                round_rule), )
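
As a rough illustration of the new bucketing behavior, below is a minimal sketch that reimplements the two helpers locally. The real implementations live in tensorrt_llm._torch.utils and may differ in detail, so the exact semantics shown here are an assumption.

# Minimal sketch (assumption): local stand-ins for the helpers imported from
# tensorrt_llm._torch.utils; the real implementations may differ in detail.

def last_positive_power_of_2(x: int) -> int:
    # Largest power of two that is <= x (at least 1 for positive x).
    p = 1
    while p * 2 <= x:
        p *= 2
    return p

def get_last_power_of_2_num_tokens_buckets(max_num_tokens: int) -> tuple:
    # Power-of-two token buckets up to and including max_num_tokens.
    buckets, m = [], 1
    while m <= max_num_tokens:
        buckets.append(m)
        m *= 2
    return tuple(buckets)

# With the cap at 2048, sequence lengths above 2048 saturate to the largest
# bucket instead of mapping to the problematic 4096 tuning point.
m_values = get_last_power_of_2_num_tokens_buckets(2048)           # (1, 2, ..., 2048)
round_rule = lambda x: min(last_positive_power_of_2(x), 2048)

assert m_values[-1] == 2048
assert round_rule(4096) == 2048   # previously rounded to 4096
assert round_rule(3000) == 2048
assert round_rule(100) == 64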