mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[None][chore] Improve the readability of the error log when the Cutlass backend's fp8-blockwise path fails because it is only supported on Hopper
Signed-off-by: xxi <xxi@nvidia.com>
This commit is contained in:
parent
ba1037ca4a
commit
f610f1b69c
@ -248,14 +248,25 @@ def fused_moe(
|
||||
)
|
||||
|
||||
run_moe = moe_runner.fused_moe_runner.run_moe_min_latency if min_latency_mode else moe_runner.fused_moe_runner.run_moe
|
||||
output = run_moe(input, token_selected_experts, token_final_scales,
|
||||
fc1_expert_weights, fc1_expert_biases, fc2_expert_weights,
|
||||
fc2_expert_biases, quant_scales, input_sf,
|
||||
swizzled_input_sf, swiglu_alpha, swiglu_beta, swiglu_limit,
|
||||
tp_size, tp_rank, ep_size, ep_rank, cluster_size,
|
||||
cluster_rank, enable_alltoall, min_latency_mode,
|
||||
[gemm_tactic_1, gemm_tactic_2], activation_type,
|
||||
unpadded_hidden_size, tuner_num_tokens, out_tensor)
|
||||
# Invoke the fused MoE kernel. If the Cutlass backend's DeepGemm JIT path
# rejects the current GPU architecture, re-raise with actionable guidance;
# all other RuntimeErrors propagate unchanged.
try:
    output = run_moe(input, token_selected_experts, token_final_scales,
                     fc1_expert_weights, fc1_expert_biases,
                     fc2_expert_weights, fc2_expert_biases, quant_scales,
                     input_sf, swizzled_input_sf, swiglu_alpha, swiglu_beta,
                     swiglu_limit, tp_size, tp_rank, ep_size, ep_rank,
                     cluster_size, cluster_rank, enable_alltoall,
                     min_latency_mode, [gemm_tactic_1, gemm_tactic_2],
                     activation_type, unpadded_hidden_size,
                     tuner_num_tokens, out_tensor)
except RuntimeError as e:
    error_msg = str(e)
    if "DeepGEMM only supports Hopper" in error_msg:
        # BUGFIX: the original concatenated f"{error_msg}" directly with the
        # "Note: ..." literal, so the two messages ran together on one line
        # with no separator. A trailing newline keeps the original error and
        # the added guidance visually distinct in the log.
        raise RuntimeError(
            f"{error_msg}\n"
            "Note: This is the Cutlass backend with DeepGemm JIT path. "
            "For Blackwell (SM100+) support, please use the DEEPGEMM backend instead."
        ) from e
    # Not the architecture-support error: propagate the original exception.
    raise
|
||||
|
||||
return output if min_latency_mode else [output]
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user