Mirror of https://github.com/vllm-project/vllm.git (synced 2026-06-06 00:16:14 +00:00)
[Bugfix] Fix test_cutlass_moe.py (#44380)
Signed-off-by: Bill Nell <bnell@redhat.com>
This commit is contained in:
@@ -205,7 +205,10 @@ def run_with_expert_maps(
|
||||
w2 = kwargs["w2"]
|
||||
a = kwargs["hidden_states"]
|
||||
moe_config = make_dummy_moe_config(
|
||||
num_experts=w2.shape[0],
|
||||
max_num_tokens=kwargs.get("hidden_states").shape[0],
|
||||
experts_per_token=kwargs.get("topk_ids").shape[1],
|
||||
num_experts=num_experts,
|
||||
num_local_experts=num_local_experts,
|
||||
hidden_dim=w2.shape[1],
|
||||
intermediate_size_per_partition=w2.shape[2],
|
||||
in_dtype=a.dtype,
|
||||
@@ -258,23 +261,27 @@ def run_8_bit(
|
||||
a1_scale=None,
|
||||
)
|
||||
|
||||
num_experts = moe_tensors.w1.size(0) # type: ignore[attr-defined]
|
||||
with_ep = num_local_experts is not None or num_local_experts == num_experts
|
||||
|
||||
kwargs = {
|
||||
"hidden_states": moe_tensors.a,
|
||||
"w1": moe_tensors.w1_q, # type: ignore[union-attr]
|
||||
"w2": moe_tensors.w2_q, # type: ignore[union-attr]
|
||||
"topk_weights": topk_weights,
|
||||
"topk_ids": topk_ids,
|
||||
"global_num_experts": moe_tensors.w1_q.shape[0], # type: ignore[union-attr]
|
||||
"global_num_experts": num_experts,
|
||||
"activation": MoEActivation.SILU,
|
||||
"expert_map": None,
|
||||
"apply_router_weight_on_input": False,
|
||||
}
|
||||
|
||||
num_experts = moe_tensors.w1.size(0) # type: ignore[attr-defined]
|
||||
with_ep = num_local_experts is not None or num_local_experts == num_experts
|
||||
if not with_ep:
|
||||
moe_config = make_dummy_moe_config(
|
||||
num_experts=moe_tensors.w2_q.shape[0], # type: ignore[union-attr]
|
||||
max_num_tokens=moe_tensors.a.shape[0],
|
||||
experts_per_token=topk_ids.shape[1],
|
||||
num_experts=num_experts,
|
||||
num_local_experts=num_local_experts,
|
||||
hidden_dim=moe_tensors.w2_q.shape[1], # type: ignore[union-attr]
|
||||
intermediate_size_per_partition=moe_tensors.w2_q.shape[2], # type: ignore[union-attr]
|
||||
in_dtype=moe_tensors.a.dtype,
|
||||
@@ -581,6 +588,7 @@ def test_run_cutlass_moe_fp8(
|
||||
per_out_channel,
|
||||
False,
|
||||
topk_weights,
|
||||
None,
|
||||
)
|
||||
|
||||
workspace13.random_()
|
||||
|
||||
@@ -49,10 +49,12 @@ def shuffle_weight(w: torch.Tensor) -> torch.Tensor:
|
||||
|
||||
def make_dummy_moe_config(
|
||||
num_experts: int = 1,
|
||||
num_local_experts: int | None = None,
|
||||
experts_per_token: int = 1,
|
||||
hidden_dim: int = 1,
|
||||
intermediate_size_per_partition: int = 1,
|
||||
in_dtype: torch.dtype = torch.bfloat16,
|
||||
max_num_tokens: int = 512,
|
||||
) -> FusedMoEConfig:
|
||||
"""
|
||||
This is a dummy config for the mk constructor interface
|
||||
@@ -66,14 +68,16 @@ def make_dummy_moe_config(
|
||||
experts_per_token=experts_per_token,
|
||||
hidden_dim=hidden_dim,
|
||||
intermediate_size_per_partition=intermediate_size_per_partition,
|
||||
num_local_experts=num_experts,
|
||||
num_local_experts=num_local_experts
|
||||
if num_local_experts is not None
|
||||
else num_experts,
|
||||
num_logical_experts=num_experts,
|
||||
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
|
||||
activation=MoEActivation.SILU,
|
||||
in_dtype=in_dtype,
|
||||
device="cuda",
|
||||
routing_method=RoutingMethodType.TopK,
|
||||
max_num_tokens=512,
|
||||
max_num_tokens=max_num_tokens,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -379,8 +379,7 @@ class CutlassExpertsFp8Base(mk.FusedMoEExpertsModular):
|
||||
topk_ids,
|
||||
activation,
|
||||
global_num_experts,
|
||||
# the fp8 cutlass experts use their own expert map.
|
||||
None,
|
||||
expert_map,
|
||||
self.w1_scale,
|
||||
self.w2_scale,
|
||||
a1q_scale,
|
||||
|
||||
Reference in New Issue
Block a user