doc: Add README for wide EP (#6356)
Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Parent: 64ba483656
Commit: e58afa510e

@@ -3,7 +3,7 @@
By NVIDIA TensorRT-LLM Team

## Table of Contents
- [Scaling Expert Parallelism in TensorRT-LLM (Part 1: Design and Implementation of Large-scale EP)](#scaling-expert-parallelism-in-tensorrt-llmpart-1-design-and-implementation-of-large-scale-ep)
- [Scaling Expert Parallelism in TensorRT-LLM (Part 1: Design and Implementation of Large-scale EP)](#scaling-expert-parallelism-in-tensorrt-llm-part-1-design-and-implementation-of-large-scale-ep)
  - [Table of Contents](#table-of-contents)
  - [Motivation for large-scale EP](#motivation-for-large-scale-ep)
    - [Observations over one machine translation dataset](#observations-over-one-machine-translation-dataset)

@@ -15,8 +15,8 @@ By NVIDIA TensorRT-LLM Team
  - [EP Load Balancer](#ep-load-balancer)
    - [Python Interface](#python-interface)
    - [C++ extension](#c-extension)
    - [Core implementations of host side logics](#core-implementations-of-host-side-logics)
    - [Core implementations of GPU side logics](#core-implementations-of-gpu-side-logics)
    - [Core implementations of the host logic](#core-implementations-of-the-host-logic)
    - [Core implementations of the GPU logic](#core-implementations-of-the-gpu-logic)
    - [Online EP Load Balancer](#online-ep-load-balancer)
    - [Offline EP Load Balancer](#offline-ep-load-balancer)
  - [E2E evaluation](#e2e-evaluation)

@@ -516,7 +516,9 @@ Clearly in Figure 25, we can see that EPLB brings a clear performance improvemen

## Reproducing steps
Currently, to run through the reproducing steps described in this section, please use this [feature branch](https://github.com/NVIDIA/TensorRT-LLM/tree/feat/large-ep/tensorrt_llm). It will be merged into the main branch soon.

### The effect of EP Load Balancer

Please refer to the [EP Load Balancer example](https://github.com/NVIDIA/TensorRT-LLM/tree/feat/large-ep/examples/ep_load_balancer) for how to reproduce the results for the offline EP Load Balancer.

##### Step 1: Run inference and collect statistics

@@ -173,14 +173,16 @@ def gen_config_file(config_path: str,
'max_batch_size': ctx_batch_size,
'max_num_tokens': ctx_max_num_tokens,
'max_seq_len': 1152,
'free_gpu_memory_fraction': 0.85,
'tensor_parallel_size': ctx_tp_size,
'moe_expert_parallel_size': ctx_tp_size,
'enable_attention_dp': ctx_enable_attention_dp,
'pipeline_parallel_size': 1,
'print_iter_log': True,
'disable_overlap_scheduler': True,
'kv_cache_dtype': 'fp8',
'kv_cache_config': {
    'free_gpu_memory_fraction': 0.85,
    'dtype': 'fp8',
},
'cache_transceiver_config': {
    'backend': 'default',
    'max_tokens_in_buffer': 8320,

@@ -195,14 +197,18 @@ def gen_config_file(config_path: str,
'max_batch_size': gen_batch_size,
'max_num_tokens': gen_max_num_tokens,
'max_seq_len': 2176,
'free_gpu_memory_fraction': gen_gpu_memory_fraction,
'cuda_graph_config': {
    'enable_padding': True,
    'batch_sizes': gen_cuda_graph_batch_sizes,
},
'print_iter_log': True,
'kv_cache_dtype': 'fp8',
'moe_backend': gen_moe_backend,
'kv_cache_config': {
    'free_gpu_memory_fraction': gen_gpu_memory_fraction,
    'dtype': 'fp8',
},
'moe_config': {
    'backend': gen_moe_backend,
},
'cache_transceiver_config': {
    'backend': 'default',
    'max_tokens_in_buffer': 8320,

@@ -242,8 +248,8 @@ def gen_config_file(config_path: str,
    f,
    default_flow_style=False,
    sort_keys=False)
config['generation_servers'][
    'moe_load_balancer'] = moe_load_balancer_file
config['generation_servers']['moe_config'][
    'load_balancer'] = moe_load_balancer_file

if mtp_size > 0:
    config['context_servers']['speculative_config'] = {

@@ -1,8 +1,6 @@
#!/bin/bash

# !!!
# Make sure that SLURM parameters are correctly set in `disaggr_torch.slurm` before executing this script.
# !!!
echo "Make sure that SLURM parameters are correctly set in \`disaggr_torch.slurm\` before executing this script."

# concurrency 8
concurrency=8

examples/wide_ep/README.md (new file, 83 lines)
@@ -0,0 +1,83 @@
# Wide Expert Parallelism (Wide-EP) in TensorRT-LLM

TensorRT-LLM's Wide Expert Parallelism (Wide-EP) feature enables efficient inference of large-scale Mixture-of-Experts (MoE) models by scaling expert parallelism beyond traditional limits. This feature addresses the inherent workload imbalance challenges in large-scale MoE models and provides both offline and online load balancing capabilities.

## Overview

Large-scale MoE models like DeepSeek-V3/R1, LLaMA4, and Qwen3 use fine-grained expert designs that introduce new challenges for inference systems:

- **High memory demands** for expert weights
- **Inherent expert-level workload imbalance** due to sparse execution patterns
- **Communication overhead** in distributed expert parallelism

Wide-EP solves these challenges through:

- **Custom EP communication kernels** optimized for NVIDIA GB200 Multi-Node NVLink (MNNVL)
- **Expert Parallelism Load Balancer (EPLB)** with both offline and online modes
- **Dynamic expert placement and replication** strategies
- **Layer-wise weight redistribution** to minimize inference disruption

## Quick Start

### 1. Configurations

An example YAML file to enable Wide-EP:
```yaml
moe_config:
  backend: WIDEEP
  max_num_tokens: 9216
  load_balancer: moe_load_balancer.yaml # (optional) enable load balancer
```

| Parameter | Description | Default | Notes |
|-----------|-------------|---------|-------|
| `backend` | MoE backend type | `CUTLASS` | Set to `WIDEEP` to enable Wide-EP |
| `max_num_tokens` | If set, at most `max_num_tokens` tokens are sent to `torch.ops.trtllm.fused_moe` at a time. | `None` | If the number of tokens exceeds `max_num_tokens`, the input tensors are split into chunks and processed in a loop. |
| `load_balancer` | Configuration for MoE load balancing | `None` | Path to the load balancer YAML file |
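
As a minimal sketch of how such a file can be used (assuming it is saved as `wide_ep_config.yaml` and the model is served with `trtllm-serve`; the model path below is a placeholder, and the exact CLI flags should be checked against your TensorRT-LLM version):

```bash
# Write the Wide-EP overrides shown above to a file (contents mirror the example YAML).
cat > wide_ep_config.yaml <<'EOF'
moe_config:
  backend: WIDEEP
  max_num_tokens: 9216
  # load_balancer: moe_load_balancer.yaml  # uncomment once the load balancer file exists
EOF

# Launch an OpenAI-compatible server with the overrides applied.
# The model path is a placeholder; replace it with your checkpoint.
trtllm-serve /path/to/DeepSeek-R1 \
    --extra_llm_api_options wide_ep_config.yaml
```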

#### Load Balancer Configuration

An example `moe_load_balancer.yaml` file to configure the online EP load balancer:
```yaml
num_slots: 288
layer_updates_per_iter: 1
```

| Parameter | Description | Default | Notes |
|-----------|-------------|---------|-------|
| `num_slots` | Total number of expert slots | `None` | Must be ≥ the total number of experts |
| `layer_updates_per_iter` | Number of layers updated per iteration | `0` | `0` = offline, `>0` = online |
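
To make the sizing constraints concrete, here is a small sanity check (a sketch only; 256 routed experts matches DeepSeek-R1, while the EP size of 32 is an assumption for illustration):

```bash
# Check a load-balancer configuration before deploying it (illustrative numbers).
NUM_EXPERTS=256   # routed experts per MoE layer (DeepSeek-R1)
EP_SIZE=32        # number of expert-parallel ranks (assumed for this example)
NUM_SLOTS=288     # num_slots from moe_load_balancer.yaml

if (( NUM_SLOTS < NUM_EXPERTS )); then
    echo "num_slots must be >= the total number of experts" >&2
    exit 1
fi
if (( NUM_SLOTS % EP_SIZE != 0 )); then
    echo "warning: num_slots is not evenly divisible by the EP size" >&2
fi
echo "slots per rank: $((NUM_SLOTS / EP_SIZE)), redundant slots: $((NUM_SLOTS - NUM_EXPERTS))"
```

With these numbers each rank hosts 9 slots, and the 32 slots beyond the 256 experts are available for replicating hot experts.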

Refer to the [ep_load_balancer](./ep_load_balancer/) directory for more details on the EP load balancer.

### 2. Execute Wide-EP on SLURM Clusters

Refer to the [slurm_scripts](./slurm_scripts/) directory, which reuses the [disaggregated slurm scripts](../disaggregated/slurm/) to automatically generate configuration files and submit jobs to SLURM clusters.
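
For orientation, the sketch below mirrors one submission from the end-to-end script included in this change (the positional arguments are consumed by `disaggr_torch.slurm`; treat the concrete values as an example rather than a recommendation, and note that reading `ctx_num` as the number of context servers is an assumption based on that script):

```bash
# Submit one DEP16 disaggregated serving job on GB200 nodes (4 GPUs per node).
mtp_size=0
ntasks_per_node=4
concurrency=1024
ctx_num=$(((concurrency + 5499) / 5500))   # number of context (prefill) servers
total_node_num=$((ctx_num + 4))            # context nodes + 4 generation nodes (16 GPUs)
ntasks=$((total_node_num * ntasks_per_node))

sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} \
    disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 16 1024 1024 true "0.7" 288 "$mtp_size" "$concurrency"
```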

## Troubleshooting

### Transparent HugePages failure

If you hit the exception `madvise(MADV_HUGEPAGE) failed.`, check whether Transparent HugePages is enabled:
```bash
>$ cat /sys/kernel/mm/transparent_hugepage/enabled
always [madvise] never
>$ cat /sys/kernel/mm/transparent_hugepage/defrag
always defer defer+madvise [madvise] never
```
If `never` is highlighted, enable Transparent HugePages with the following command:
```bash
echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
```
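
Note that the redirection above must run in a root shell; `sudo echo madvise > ...` does not work because the redirection is performed by the unprivileged shell. A small guard like the following (a sketch using the standard sysfs path) achieves the same effect for non-root users and verifies the result:

```bash
# Enable madvise-based Transparent HugePages if it is not already active (requires sudo).
THP=/sys/kernel/mm/transparent_hugepage/enabled
if ! grep -qE '\[(always|madvise)\]' "$THP"; then
    echo madvise | sudo tee "$THP" > /dev/null
fi
cat "$THP"   # expect: always [madvise] never
```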

### Disaggregated serving related issues

Refer to the [Troubleshooting and FAQ](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/disaggregated-service.md#troubleshooting-and-faq) section of the Disaggregated-Service documentation.

## References

- [Technical Blog: Scaling Expert Parallelism in TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md)

For detailed implementation examples and advanced usage, see the subdirectories:
- [`ep_load_balancer/`](ep_load_balancer/): Load balancing tools and examples
- [`slurm_scripts/`](slurm_scripts/): Cluster deployment scripts

@@ -1,30 +1,19 @@
#!/bin/bash

# !!!
# Please find the `disaggr_torch.slurm` script in the `examples/disaggregated/slurm/` directory.
# Make sure that SLURM parameters are correctly set in `disaggr_torch.slurm` before executing this script.
# !!!
echo "Please find the \`disaggr_torch.slurm\` script in the \`examples/disaggregated/slurm/\` directory."
echo "Make sure that SLURM parameters are correctly set in \`disaggr_torch.slurm\` before executing this script."

mtp_size=0
ntasks_per_node=4 # 4 GPUs per GB200 node

# dep8
for b in 1 64 1024; do
    concurrency=$((b * 8))
    ctx_num=$(((concurrency + 5499)/5500))
    total_node_num=$((ctx_num + 2))
    ntasks=$((total_node_num * ntasks_per_node))
    sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 8 1024 1024 true "0.8" 0 "$mtp_size" "$concurrency"
done

# dep16 eplb0, 256, 288
for b in 1 64 1024; do
    concurrency=$((b * 16))
    ctx_num=$(((concurrency + 5499)/5500))
    total_node_num=$((ctx_num + 4))
    ntasks=$((total_node_num * ntasks_per_node))
    sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 16 1024 1024 true "0.7" 0 "$mtp_size" "$concurrency"
    sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 16 1024 1024 true "0.7" 256 "$mtp_size" "$concurrency"
    # sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 16 1024 1024 true "0.7" 0 "$mtp_size" "$concurrency"
    # sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 16 1024 1024 true "0.7" 288 "$mtp_size" "$concurrency"
    sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 16 1024 1024 true "0.7" 288 "$mtp_size" "$concurrency"
done