#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest

import numpy as np
import pytest
import torch

import tensorrt_llm.bindings.internal.runtime as _tbr

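# tensorrt_llm.bindings.internal.runtime exposes the C++ runtime bindings;
# every MoE load-balancing type and helper exercised below (MoeWeight,
# MoeLoadBalancer, MoeLoadBalanceMetaInfo, MoePlacementCpuInfo,
# do_replication, do_placement) comes from this module.
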
class TestMoePythonBindings(unittest.TestCase):

    def setUp(self):
        # Allow host-accessible allocations to fall back to CUDA managed
        # memory, per the variable name.
        os.environ["TLLM_HOST_ACCESSIBLE_ALLOW_MANAGED_FALLBACK"] = "1"
        torch.cuda.set_device(0)
        # Common test parameters
        self.expert_count = 8
        self.top_k = 2
        self.ep_rank = 0  # Local rank
        self.ep_size = 2  # Total ranks
        self.slot_count_per_rank = 4
        self.layer_updates_per_iter = 1
        self.gathered_raw_expert_ids = torch.tensor(
            [
                [0, 1],  # Token 0 selects experts 0 and 1
                [1, 2],  # Token 1 selects experts 1 and 2
                [2, 3],  # Token 2 selects experts 2 and 3
                [0, 3]  # Token 3 selects experts 0 and 3
            ],
            dtype=torch.int32,
            device="cuda")

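    # With these parameters the deployment has ep_size * slot_count_per_rank
    # = 2 * 4 = 8 weight slots in total, exactly one per expert
    # (expert_count = 8), so the initial assignment can give every expert its
    # own slot; top_k = 2 means each token above routes to two experts.
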
    def test_moe_weight_struct(self):
        """Test the Python binding of the MoeWeight structure"""
        # Create a MoeWeight instance
        weight = _tbr.MoeWeight()

        # Verify structure field access
        weight.weight_ptr = 1024  # use a dummy value for pointer
        weight.height = 10
        weight.width = 1024
        weight.pitch = 1024

        # Verify __repr__ method
        repr_str = str(weight)
        self.assertIn("height=10", repr_str)
        self.assertIn("width=1024", repr_str)
        self.assertIn("pitch=1024", repr_str)

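    # Note: this test only exercises field access and __repr__ with dummy
    # integers; _create_weight_buffers below shows the intended usage, where
    # weight_ptr comes from a tensor's data_ptr() and width/pitch are given
    # in bytes.
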
    def test_moe_load_balancer_creation(self):
        """Test MoeLoadBalancer creation"""
        # Create MoeLoadBalancer instance
        balancer = _tbr.MoeLoadBalancer(
            ep_rank=self.ep_rank,
            ep_size=self.ep_size,
            layer_updates_per_iter=self.layer_updates_per_iter)
        self.assertIsNotNone(balancer)

    def test_add_layer(self):
        """Test adding layers to MoeLoadBalancer"""
        balancer = _tbr.MoeLoadBalancer(
            ep_rank=self.ep_rank,
            ep_size=self.ep_size,
            layer_updates_per_iter=self.layer_updates_per_iter)

        # Add a layer and verify return value type
        layer0 = balancer.add_layer(
            expert_count=self.expert_count,
            top_k=self.top_k,
            slot_count_per_rank=self.slot_count_per_rank)

        self.assertIsInstance(layer0, _tbr.SingleLayerMoeLoadBalancer)
        self.assertEqual(layer0.get_layer_id(), 0)

        # Add another layer and verify return value type; layer ids are
        # assigned sequentially starting from 0
        layer1 = balancer.add_layer(
            expert_count=self.expert_count,
            top_k=self.top_k,
            slot_count_per_rank=self.slot_count_per_rank)
        self.assertIsInstance(layer1, _tbr.SingleLayerMoeLoadBalancer)
        self.assertEqual(layer1.get_layer_id(), 1)

    def _create_weight_buffers(self, weight_height, weight_width, num_experts,
                               num_slots):
        """Create weight buffers for testing"""
        # Create CPU weights
        host_weights = []
        for i in range(num_experts):
            # Create a unique weight for each expert
            host_data = np.ones(
                (weight_height, weight_width), dtype=np.float32) * (i + 1)
            host_buffer = torch.tensor(host_data,
                                       dtype=torch.float32).contiguous()
            host_weight = _tbr.MoeWeight()
            host_weight.weight_ptr = host_buffer.data_ptr()
            host_weight.height = weight_height
            host_weight.width = weight_width * 4  # float32 = 4 bytes
            host_weight.pitch = weight_width * 4  # pitch = width (contiguous memory)
            host_weights.append((host_buffer, host_weight))

        # Create GPU weight slots
        slot_weights = []
        for i in range(num_slots):
            # Create a GPU buffer for each slot
            cuda_buffer = torch.zeros((weight_height, weight_width),
                                      dtype=torch.float32,
                                      device='cuda').contiguous()
            gpu_weight = _tbr.MoeWeight()
            gpu_weight.weight_ptr = cuda_buffer.data_ptr()
            gpu_weight.height = weight_height
            gpu_weight.width = weight_width * 4  # float32 = 4 bytes
            gpu_weight.pitch = weight_width * 4  # pitch = width (contiguous memory)
            slot_weights.append((cuda_buffer, gpu_weight))

        return host_weights, slot_weights

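    # The (buffer, MoeWeight) tuples returned above pair each descriptor with
    # the tensor that owns its storage: MoeWeight only records a raw
    # weight_ptr, so the Python-side buffer has to stay alive for as long as
    # the load balancer may touch that memory.
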
    def test_single_layer_moe_load_balancer_operations(self):
        """Test operations of SingleLayerMoeLoadBalancer"""
        # Create MoeLoadBalancer instance
        balancer = _tbr.MoeLoadBalancer(
            ep_rank=self.ep_rank,
            ep_size=self.ep_size,
            layer_updates_per_iter=self.layer_updates_per_iter)

        balancer.set_use_gpu_memcpy(True)

        # Add a layer
        layer = balancer.add_layer(
            expert_count=self.expert_count,
            top_k=self.top_k,
            slot_count_per_rank=self.slot_count_per_rank)

        # Create weight buffers for testing
        weight_height = 10
        weight_width = 1024
        host_weights, slot_weights = self._create_weight_buffers(
            weight_height, weight_width, self.expert_count,
            self.slot_count_per_rank)

        # Add weight slots
        for slot_id, (_, slot_weight) in enumerate(slot_weights):
            layer.add_single_weight_slot(slot_id, "test_weight", slot_weight)

        # Add host weights
        for expert_id, (_, host_weight) in enumerate(host_weights):
            layer.add_single_host_weight(expert_id, "test_weight", host_weight)

        # Create initial weight assignments
        initial_assignments = []
        for r in range(self.ep_size):
            expert_start = r * self.expert_count // self.ep_size
            for slot_id in range(self.slot_count_per_rank):
                expert_id = (expert_start + slot_id) % self.expert_count
                initial_assignments.append(expert_id)

        # Set initial weight assignments
        layer.set_initial_weight_assignments(initial_assignments)

        # Finalize model setup
        balancer.finalize_model()

        # Run one iteration
        balancer.set_warm_up_iter_count(1)
        balancer.start_iter(0, True, True)  # Iteration 0, enable statistics, enable updates

        enabled = torch.ops.trtllm.moe_load_balance_wait_gpu_stage(
            layer.get_pointer())
        torch.ops.trtllm.moe_load_balance_statistic(
            self.gathered_raw_expert_ids, enabled, layer.get_pointer(), True,
            True)
        torch.ops.trtllm.moe_load_balance_set_cpu_stage(layer.get_pointer())

        balancer.end_iter(0)

        # Run a second iteration
        balancer.start_iter(1, True, True)  # Iteration 1, enable statistics, enable updates
        enabled = torch.ops.trtllm.moe_load_balance_wait_gpu_stage(
            layer.get_pointer())
        torch.ops.trtllm.moe_load_balance_statistic(
            self.gathered_raw_expert_ids, enabled, layer.get_pointer(), True,
            True)
        torch.ops.trtllm.moe_load_balance_set_cpu_stage(layer.get_pointer())
        balancer.end_iter(1)

        # Shutdown the load balancer
        balancer.shutdown()

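    # Each iteration above follows the same handshake: start_iter() opens the
    # iteration, moe_load_balance_wait_gpu_stage blocks until the layer is in
    # its GPU stage and returns an "enabled" flag tensor,
    # moe_load_balance_statistic accumulates per-expert routing counts from
    # the gathered expert ids, moe_load_balance_set_cpu_stage hands the layer
    # back to the CPU side, and end_iter() closes the iteration so weight
    # updates can be applied when enabled.
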
    def test_moe_load_balancer_multiple_layers(self):
        """Test MoeLoadBalancer with multiple layers"""
        # Create MoeLoadBalancer instance
        balancer = _tbr.MoeLoadBalancer(
            ep_rank=self.ep_rank,
            ep_size=self.ep_size,
            layer_updates_per_iter=self.layer_updates_per_iter)

        balancer.set_use_gpu_memcpy(True)

        # Create initial weight assignments
        initial_assignments = []
        for r in range(self.ep_size):
            expert_start = r * self.expert_count // self.ep_size
            for slot_id in range(self.slot_count_per_rank):
                expert_id = (expert_start + slot_id) % self.expert_count
                initial_assignments.append(expert_id)

        # Add multiple layers
        num_layers = 3
        layers = []
        for _ in range(num_layers):
            layer = balancer.add_layer(
                expert_count=self.expert_count,
                top_k=self.top_k,
                slot_count_per_rank=self.slot_count_per_rank)
            layers.append(layer)
            layer.set_initial_weight_assignments(initial_assignments)

        # Verify that we got multiple different layers
        self.assertEqual(len(layers), num_layers)
        for i in range(num_layers):
            for j in range(i + 1, num_layers):
                self.assertIsNot(
                    layers[i], layers[j],
                    f"Layer {i} and {j} should be different objects")

        # Finalize model setup
        balancer.finalize_model()

        # Set warm-up iteration count
        balancer.set_warm_up_iter_count(2)

        # Run several iterations: (iter_id, enable_statistic, enable_update_weights)
        configs = [
            (0, True, True),  # Iteration 0: enable statistics, enable updates (warm-up)
            (1, True, True),  # Iteration 1: enable statistics, enable updates (warm-up)
            (2, True, True),  # Iteration 2: enable statistics, enable updates
            (3, True, False),  # Iteration 3: enable statistics, disable updates
            (4, False, False),  # Iteration 4: disable statistics, disable updates
        ]

        for iter_id, enable_statistic, enable_update_weights in configs:
            balancer.start_iter(iter_id, enable_statistic,
                                enable_update_weights)
            for layer in layers:
                enabled = torch.ops.trtllm.moe_load_balance_wait_gpu_stage(
                    layer.get_pointer())
                torch.ops.trtllm.moe_load_balance_statistic(
                    self.gathered_raw_expert_ids, enabled, layer.get_pointer(),
                    enable_statistic, enable_update_weights)
                torch.ops.trtllm.moe_load_balance_set_cpu_stage(
                    layer.get_pointer())
            balancer.end_iter(iter_id)

        # Shutdown the load balancer
        balancer.shutdown()

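# The parametrized tests below are only collected by pytest; the
# unittest.main() entry point at the bottom of the file runs just the
# TestMoePythonBindings class above.
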
@pytest.mark.parametrize(
    "expert_count,slot_count_per_rank,ep_size,expert_load_factors,expected_replica_counts",
    [(4, 2, 2, [1, 2, 3, 4], [1, 1, 1, 1]),
     (4, 3, 2, [0, 0, 0, 0], [2, 2, 1, 1]),
     (4, 4, 2, [1, 4, 2, 1], [1, 4, 2, 1]),
     (5, 3, 3, [10, 5, 2, 15, 8], [2, 1, 1, 3, 2]),
     (3, 2, 2, [0, 10, 0], [1, 2, 1])],
    ids=[
        "equal_slots_and_experts", "zero_load_factors", "varied_load_factors",
        "complex_load_distribution", "single_expert_with_load"
    ])
def test_do_replication(expert_count, slot_count_per_rank, ep_size,
                        expert_load_factors, expected_replica_counts):
    """Test the doReplication function.

    Reference: MoeReplicationTest in cpp/tests/runtime/moeLoadBalancerTest.cpp
    """
    meta_info = _tbr.MoeLoadBalanceMetaInfo(
        expert_count=expert_count,
        top_k=2,
        ep_rank=0,
        ep_size=ep_size,
        slot_count_per_rank=slot_count_per_rank)
    placement_info = _tbr.MoePlacementCpuInfo()
    placement_info.expert_replica_count = [0] * expert_count

    _tbr.do_replication(meta_info, expert_load_factors, placement_info)

    assert placement_info.expert_replica_count == expected_replica_counts
    assert sum(
        placement_info.expert_replica_count) == ep_size * slot_count_per_rank

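# A worked example for the "varied_load_factors" case above: expert_count = 4,
# slot_count_per_rank = 4 and ep_size = 2 give 2 * 4 = 8 slots for 4 experts.
# The load factors [1, 4, 2, 1] happen to sum to 8, so a load-proportional
# replication yields [1, 4, 2, 1] replicas, which fills all 8 slots and
# matches the expected counts.
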
@pytest.mark.parametrize(
    "expert_count,slot_count_per_rank,ep_size,expert_load_factors,replica_counts,max_expected_load_diff",
    [(4, 2, 2, [1, 1, 1, 1], [1, 1, 1, 1], 0.001),
     (4, 3, 2, [1, 3, 2, 1], [1, 2, 2, 1], 0.01),
     (5, 3, 3, [10, 5, 2, 15, 8], [2, 1, 1, 3, 2], 2.0),
     (3, 2, 2, [1, 100, 10], [1, 2, 1], 9.0),
     (6, 3, 2, [2, 3, 4, 5, 6, 7], [1, 1, 1, 1, 1, 1], 1.0)],
    ids=[
        "equal_load_factors", "varied_load_factors",
        "complex_load_distribution", "extreme_load_differences",
        "one_replica_per_expert"
    ])
def test_do_placement(expert_count, slot_count_per_rank, ep_size,
                      expert_load_factors, replica_counts,
                      max_expected_load_diff):
    """Test the doPlacement function.

    Reference: MoePlacementTest in cpp/tests/runtime/moeLoadBalancerTest.cpp
    """
    meta_info = _tbr.MoeLoadBalanceMetaInfo(
        expert_count=expert_count,
        top_k=2,
        ep_rank=0,
        ep_size=ep_size,
        slot_count_per_rank=slot_count_per_rank)
    placement_info = _tbr.MoePlacementCpuInfo()
    placement_info.expert_replica_count = replica_counts
    placement_info.rank_expert_ids = [[0] * slot_count_per_rank
                                      for _ in range(ep_size)]

    _tbr.do_placement(meta_info, expert_load_factors, placement_info)

    # Each expert must be placed exactly replica_counts[expert_id] times
    # across all ranks.
    expert_assignments = [0] * expert_count
    for local_expert_ids in placement_info.rank_expert_ids:
        for expert_id in local_expert_ids:
            expert_assignments[expert_id] += 1

    assert expert_assignments == replica_counts

    # Per-rank load, assuming each replica carries an equal share of its
    # expert's load factor.
    rank_loads = [0] * ep_size
    for rank_id, local_expert_ids in enumerate(placement_info.rank_expert_ids):
        for expert_id in local_expert_ids:
            rank_loads[rank_id] += expert_load_factors[
                expert_id] / replica_counts[expert_id]

    # The placement should keep ranks balanced: the largest pairwise
    # difference in rank load must stay within the expected bound.
    max_load_diff = 0
    for i in range(ep_size):
        for j in range(i + 1, ep_size):
            max_load_diff = max(max_load_diff,
                                abs(rank_loads[i] - rank_loads[j]))
    assert max_load_diff <= max_expected_load_diff

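# A worked example for the "equal_load_factors" case above: four experts with
# load factor 1 and one replica each are spread over 2 ranks with 2 slots per
# rank, so every rank carries load 1 + 1 = 2, the pairwise difference is 0,
# and the 0.001 tolerance holds trivially.
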
if __name__ == '__main__':
    unittest.main()