mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[None][chore] Improve the readability of the error log when the Cutlass backend's fp8-blockwise path fails because it is only supported on Hopper
Signed-off-by: xxi <xxi@nvidia.com>
This commit is contained in:
parent
ba1037ca4a
commit
f610f1b69c
@ -248,14 +248,25 @@ def fused_moe(
|
||||
)
|
||||
|
||||
run_moe = moe_runner.fused_moe_runner.run_moe_min_latency if min_latency_mode else moe_runner.fused_moe_runner.run_moe
|
||||
output = run_moe(input, token_selected_experts, token_final_scales,
|
||||
fc1_expert_weights, fc1_expert_biases, fc2_expert_weights,
|
||||
fc2_expert_biases, quant_scales, input_sf,
|
||||
swizzled_input_sf, swiglu_alpha, swiglu_beta, swiglu_limit,
|
||||
tp_size, tp_rank, ep_size, ep_rank, cluster_size,
|
||||
cluster_rank, enable_alltoall, min_latency_mode,
|
||||
[gemm_tactic_1, gemm_tactic_2], activation_type,
|
||||
unpadded_hidden_size, tuner_num_tokens, out_tensor)
|
||||
# Invoke the fused MoE kernel. If the Cutlass backend's DeepGemm JIT path
# rejects the current GPU architecture, re-raise with actionable guidance;
# all other RuntimeErrors propagate unchanged.
try:
    output = run_moe(input, token_selected_experts, token_final_scales,
                     fc1_expert_weights, fc1_expert_biases,
                     fc2_expert_weights, fc2_expert_biases, quant_scales,
                     input_sf, swizzled_input_sf, swiglu_alpha, swiglu_beta,
                     swiglu_limit, tp_size, tp_rank, ep_size, ep_rank,
                     cluster_size, cluster_rank, enable_alltoall,
                     min_latency_mode, [gemm_tactic_1, gemm_tactic_2],
                     activation_type, unpadded_hidden_size,
                     tuner_num_tokens, out_tensor)
except RuntimeError as e:
    error_msg = str(e)
    if "DeepGEMM only supports Hopper" in error_msg:
        # BUGFIX: the original concatenated f"{error_msg}" directly with the
        # "Note: ..." literal, so the two messages ran together on one line
        # with no separator. A trailing newline keeps the original error and
        # the added guidance visually distinct in the log.
        raise RuntimeError(
            f"{error_msg}\n"
            "Note: This is the Cutlass backend with DeepGemm JIT path. "
            "For Blackwell (SM100+) support, please use the DEEPGEMM backend instead."
        ) from e
    # Not the architecture-support error: propagate the original exception.
    raise
|
||||
|
||||
return output if min_latency_mode else [output]
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user