Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-04 18:21:52 +08:00)
[None][chore] switch to ConfigurableMoE as the default path (#10792)
Signed-off-by: xxi <xxi@nvidia.com>
parent a4152c80f6
commit 9feebb3a27
@@ -167,6 +167,7 @@ class ConfigurableMoE(MoE):
             swiglu_limit=kwargs.get("swiglu_limit"),
             init_load_balancer=False,
             without_comm=True,
+            activation_type=self.activation_type,
         )

         self.validate_backend(backend)
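For readers outside the codebase, the hunk above extends the arguments that ConfigurableMoE forwards to its wrapped backend with the module's activation_type. Below is a minimal, self-contained sketch of that forwarding pattern; LegacyBackend and ConfigurableWrapper are hypothetical stand-ins, not the real TensorRT-LLM classes.

# Minimal sketch of the kwarg-forwarding pattern in the hunk above.
# LegacyBackend and ConfigurableWrapper are hypothetical stand-ins,
# not the real TensorRT-LLM classes.

class LegacyBackend:
    def __init__(self, *, swiglu_limit=None, init_load_balancer=True,
                 without_comm=False, activation_type="swiglu"):
        self.swiglu_limit = swiglu_limit
        self.init_load_balancer = init_load_balancer
        self.without_comm = without_comm
        self.activation_type = activation_type


class ConfigurableWrapper:
    def __init__(self, activation_type="swiglu", **kwargs):
        self.activation_type = activation_type
        # The wrapper keeps load balancing and communication to itself and
        # forwards the activation choice to the wrapped backend.
        self.backend = LegacyBackend(
            swiglu_limit=kwargs.get("swiglu_limit"),
            init_load_balancer=False,
            without_comm=True,
            activation_type=self.activation_type,  # the line added in the hunk above
        )


# Example: the activation choice now reaches the inner backend.
moe = ConfigurableWrapper(activation_type="geglu", swiglu_limit=7.0)
assert moe.backend.activation_type == "geglu"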
@@ -344,7 +344,7 @@ def create_moe(
     moe_cls = get_moe_cls(model_config, override_quant_config)

     enable_configurable_moe = os.environ.get("ENABLE_CONFIGURABLE_MOE",
-                                             "0") == "1"
+                                             "1") == "1"
     if enable_configurable_moe or moe_cls == CuteDslFusedMoE:
         if moe_cls in (DeepGemmFusedMoE, TRTLLMGenFusedMoE, CuteDslFusedMoE,
                        CutlassFusedMoE):
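The hunk above is the core of the change: the default for ENABLE_CONFIGURABLE_MOE flips from "0" to "1", so the ConfigurableMoE path is taken unless the variable is explicitly set to something other than "1". A minimal sketch of the same check, stripped of the surrounding create_moe logic:

import os

# Same default-to-enabled check as in the diff above:
# unset or "1" -> ConfigurableMoE path, anything else -> legacy path.
enable_configurable_moe = os.environ.get("ENABLE_CONFIGURABLE_MOE", "1") == "1"

if enable_configurable_moe:
    print("ConfigurableMoE path (the new default)")
else:
    # Opt back into the previous behaviour by exporting
    # ENABLE_CONFIGURABLE_MOE=0 before launching.
    print("Legacy MoE backend path")

Note that the check is a plain string comparison, so any value other than the exact string "1" disables the new path.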
@@ -365,6 +365,7 @@ def create_moe(
             swiglu_alpha=swiglu_alpha,
             swiglu_beta=swiglu_beta,
             swiglu_limit=swiglu_limit,
+            activation_type=activation_type,
         )
     else:
         # Check if this is a TRTLLM backend request that fallback to CutlassFusedMoE
@@ -378,10 +379,12 @@ def create_moe(
                     f"ConfigurableMoE only supports TRTLLMGenFusedMoE and CuteDslFusedMoE backends. "
                     f"Continuing with legacy MoE backend {moe_cls.__name__}.")
             else:
-                # For other incompatible backends, raise error
-                raise ValueError(
-                    f"ENABLE_CONFIGURABLE_MOE is set but backend {moe_cls.__name__} is not supported. "
-                    f"ConfigurableMoE only supports TRTLLMGenFusedMoE backend.")
+                # Other backends are not supported by ConfigurableMoE, fallback to legacy backend
+                # This is a WAR to make sure all the CI test cases pass.
+                # TODO: Remove this workaround when ConfigurableMoE is supported by all backends.
+                logger.warning(
+                    f"ENABLE_CONFIGURABLE_MOE is set but {moe_cls.__name__} is not supported by ConfigurableMoE. "
+                    f"Continuing with legacy MoE backend {moe_cls.__name__}.")

     # Use legacy create_moe_backend for other backends or when ConfigurableMoE is disabled
     return create_moe_backend(
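The hunk above also softens the unsupported-backend case from a hard ValueError to a warning plus fallback to the legacy backend, which the in-code comments describe as a temporary workaround for CI. The sketch below shows only that warn-and-fall-back shape; pick_moe_cls and the placeholder backend classes are hypothetical and simplify the real create_moe control flow.

import logging
import os

logger = logging.getLogger(__name__)


# Placeholder classes standing in for the real TensorRT-LLM backends.
class TRTLLMGenFusedMoE: ...
class CuteDslFusedMoE: ...
class CutlassFusedMoE: ...
class ConfigurableMoE: ...


CONFIGURABLE_SUPPORTED = (TRTLLMGenFusedMoE, CuteDslFusedMoE)


def pick_moe_cls(moe_cls):
    """Return ConfigurableMoE when enabled and supported, otherwise fall back."""
    enabled = os.environ.get("ENABLE_CONFIGURABLE_MOE", "1") == "1"
    if enabled and moe_cls in CONFIGURABLE_SUPPORTED:
        return ConfigurableMoE
    if enabled:
        # Mirrors the new behaviour: warn and fall back instead of raising.
        logger.warning(
            "ENABLE_CONFIGURABLE_MOE is set but %s is not supported by "
            "ConfigurableMoE. Continuing with legacy MoE backend %s.",
            moe_cls.__name__, moe_cls.__name__)
    return moe_cls


# An unsupported backend now falls back with a warning instead of raising.
assert pick_moe_cls(CutlassFusedMoE) is CutlassFusedMoE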
@@ -86,14 +86,6 @@ l0_b200:
   - unittest/tools/test_layer_wise_benchmarks.py::test_nemotron_gen_dep[1]
   - unittest/tools/test_layer_wise_benchmarks.py::test_qwen3_next_gen_tep[1]
   - unittest/_torch/modeling/test_modeling_exaone4.py::TestEXAONE4::test_llm_load_1_FP8
-  - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_configurable_moe-disable_finalize_fusion-TRTLLM-dtype1]
-  - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[enable_configurable_moe-TRTLLM]
-  - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_mxfp4_mxfp8[enable_configurable_moe-True-8-64-TRTLLM]
-  - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_wfp4a16[enable_configurable_moe-TRTLLM-2880-dtype0]
-  - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_configurable_moe-enable_finalize_fusion-CUTLASS-dtype1]
-  - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[enable_configurable_moe-CUTLASS]
-  - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_mxfp4_mxfp8[enable_configurable_moe-True-8-256-CUTLASS]
-  - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_fp8_blockwise_deepgemm[enable_configurable_moe-dtype1-72-256-2560-DefaultMoeRoutingMethod]
 - condition:
     ranges:
       system_gpu_count:
@@ -162,7 +154,6 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTEDSL-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[enable_configurable_moe-mtp=disable-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B_Instruct_RocketKV::test_auto_dtype
   # ------------- AutoDeploy Backend Stages ---------------
@@ -18,10 +18,6 @@ l0_dgx_b200:
   - unittest/_torch/misc/test_autotuner.py::test_autotuner_distributed_strategy
   - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEPLowLatency]
   - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[NVLinkTwoSided]
-  - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_configurable_moe-disable_finalize_fusion-TRTLLM-dtype1]
-  - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[enable_configurable_moe-TRTLLM]
-  - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_mxfp4_mxfp8[enable_configurable_moe-True-8-64-TRTLLM]
-  - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_wfp4a16[enable_configurable_moe-TRTLLM-2880-dtype0]
   - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-True]
   - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-True-True]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True]
@@ -97,12 +97,6 @@ l0_gb200_multi_gpus:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-tp4-trtllm-fp8]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-ep4-trtllm-fp8]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-dp4-trtllm-fp8]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus_online_eplb[enable_configurable_moe-fp8]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm_eagle] TIMEOUT (90)