diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 1380281bb2e..e3315142a9b 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -205,7 +205,10 @@ def run_with_expert_maps( w2 = kwargs["w2"] a = kwargs["hidden_states"] moe_config = make_dummy_moe_config( - num_experts=w2.shape[0], + max_num_tokens=kwargs.get("hidden_states").shape[0], + experts_per_token=kwargs.get("topk_ids").shape[1], + num_experts=num_experts, + num_local_experts=num_local_experts, hidden_dim=w2.shape[1], intermediate_size_per_partition=w2.shape[2], in_dtype=a.dtype, @@ -258,23 +261,27 @@ def run_8_bit( a1_scale=None, ) + num_experts = moe_tensors.w1.size(0) # type: ignore[attr-defined] + with_ep = num_local_experts is not None or num_local_experts == num_experts + kwargs = { "hidden_states": moe_tensors.a, "w1": moe_tensors.w1_q, # type: ignore[union-attr] "w2": moe_tensors.w2_q, # type: ignore[union-attr] "topk_weights": topk_weights, "topk_ids": topk_ids, - "global_num_experts": moe_tensors.w1_q.shape[0], # type: ignore[union-attr] + "global_num_experts": num_experts, "activation": MoEActivation.SILU, "expert_map": None, "apply_router_weight_on_input": False, } - num_experts = moe_tensors.w1.size(0) # type: ignore[attr-defined] - with_ep = num_local_experts is not None or num_local_experts == num_experts if not with_ep: moe_config = make_dummy_moe_config( - num_experts=moe_tensors.w2_q.shape[0], # type: ignore[union-attr] + max_num_tokens=moe_tensors.a.shape[0], + experts_per_token=topk_ids.shape[1], + num_experts=num_experts, + num_local_experts=num_local_experts, hidden_dim=moe_tensors.w2_q.shape[1], # type: ignore[union-attr] intermediate_size_per_partition=moe_tensors.w2_q.shape[2], # type: ignore[union-attr] in_dtype=moe_tensors.a.dtype, @@ -581,6 +588,7 @@ def test_run_cutlass_moe_fp8( per_out_channel, False, topk_weights, + None, ) workspace13.random_() diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index 3503ce4cdeb..ebb99576756 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -49,10 +49,12 @@ def shuffle_weight(w: torch.Tensor) -> torch.Tensor: def make_dummy_moe_config( num_experts: int = 1, + num_local_experts: int | None = None, experts_per_token: int = 1, hidden_dim: int = 1, intermediate_size_per_partition: int = 1, in_dtype: torch.dtype = torch.bfloat16, + max_num_tokens: int = 512, ) -> FusedMoEConfig: """ This is a dummy config for the mk constructor interface @@ -66,14 +68,16 @@ def make_dummy_moe_config( experts_per_token=experts_per_token, hidden_dim=hidden_dim, intermediate_size_per_partition=intermediate_size_per_partition, - num_local_experts=num_experts, + num_local_experts=num_local_experts + if num_local_experts is not None + else num_experts, num_logical_experts=num_experts, moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), activation=MoEActivation.SILU, in_dtype=in_dtype, device="cuda", routing_method=RoutingMethodType.TopK, - max_num_tokens=512, + max_num_tokens=max_num_tokens, ) diff --git a/vllm/model_executor/layers/fused_moe/experts/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/experts/cutlass_moe.py index d8570049af2..fa91804f35c 100644 --- a/vllm/model_executor/layers/fused_moe/experts/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/cutlass_moe.py @@ -379,8 +379,7 @@ class CutlassExpertsFp8Base(mk.FusedMoEExpertsModular): topk_ids, activation, global_num_experts, - # the fp8 cutlass experts use their own expert map. - None, + expert_map, self.w1_scale, self.w2_scale, a1q_scale,