doc: Add README for wide EP (#6356)
Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Parent: 64ba483656
Commit: e58afa510e

@@ -3,7 +3,7 @@
By NVIDIA TensorRT-LLM Team

## Table of Contents
- [Scaling Expert Parallelism in TensorRT-LLM (Part 1: Design and Implementation of Large-scale EP)](#scaling-expert-parallelism-in-tensorrt-llmpart-1-design-and-implementation-of-large-scale-ep)
- [Scaling Expert Parallelism in TensorRT-LLM (Part 1: Design and Implementation of Large-scale EP)](#scaling-expert-parallelism-in-tensorrt-llm-part-1-design-and-implementation-of-large-scale-ep)
  - [Table of Contents](#table-of-contents)
  - [Motivation for large-scale EP](#motivation-for-large-scale-ep)
    - [Observations over one machine translation dataset](#observations-over-one-machine-translation-dataset)

@@ -15,8 +15,8 @@ By NVIDIA TensorRT-LLM Team
  - [EP Load Balancer](#ep-load-balancer)
    - [Python Interface](#python-interface)
    - [C++ extension](#c-extension)
    - [Core implementations of host side logics](#core-implementations-of-host-side-logics)
    - [Core implementations of GPU side logics](#core-implementations-of-gpu-side-logics)
    - [Core implementations of the host logic](#core-implementations-of-the-host-logic)
    - [Core implementations of the GPU logic](#core-implementations-of-the-gpu-logic)
    - [Online EP Load Balancer](#online-ep-load-balancer)
    - [Offline EP Load Balancer](#offline-ep-load-balancer)
  - [E2E evaluation](#e2e-evaluation)

@@ -516,7 +516,9 @@ Clearly in Figure 25, we can see that EPLB brings a clear performance improvemen

## Reproducing steps
Currently, to run through the reproducing steps described in this section, please use this [feature branch](https://github.com/NVIDIA/TensorRT-LLM/tree/feat/large-ep/tensorrt_llm). It will be merged into the main branch soon.

### The effect of EP Load Balancer

Please refer to the [EP Load Balancer example](https://github.com/NVIDIA/TensorRT-LLM/tree/feat/large-ep/examples/ep_load_balancer) for how to reproduce the results for the offline EP Load Balancer.

##### Step 1: Run inference and collect statistics

@@ -173,14 +173,16 @@ def gen_config_file(config_path: str,
'max_batch_size': ctx_batch_size,
'max_num_tokens': ctx_max_num_tokens,
'max_seq_len': 1152,
'free_gpu_memory_fraction': 0.85,
'tensor_parallel_size': ctx_tp_size,
'moe_expert_parallel_size': ctx_tp_size,
'enable_attention_dp': ctx_enable_attention_dp,
'pipeline_parallel_size': 1,
'print_iter_log': True,
'disable_overlap_scheduler': True,
'kv_cache_dtype': 'fp8',
'kv_cache_config': {
    'free_gpu_memory_fraction': 0.85,
    'dtype': 'fp8',
},
'cache_transceiver_config': {
    'backend': 'default',
    'max_tokens_in_buffer': 8320,

@@ -195,14 +197,18 @@ def gen_config_file(config_path: str,
'max_batch_size': gen_batch_size,
'max_num_tokens': gen_max_num_tokens,
'max_seq_len': 2176,
'free_gpu_memory_fraction': gen_gpu_memory_fraction,
'cuda_graph_config': {
    'enable_padding': True,
    'batch_sizes': gen_cuda_graph_batch_sizes,
},
'print_iter_log': True,
'kv_cache_dtype': 'fp8',
'moe_backend': gen_moe_backend,
'kv_cache_config': {
    'free_gpu_memory_fraction': gen_gpu_memory_fraction,
    'dtype': 'fp8',
},
'moe_config': {
    'backend': gen_moe_backend,
},
'cache_transceiver_config': {
    'backend': 'default',
    'max_tokens_in_buffer': 8320,

@@ -242,8 +248,8 @@ def gen_config_file(config_path: str,
    f,
    default_flow_style=False,
    sort_keys=False)
config['generation_servers'][
    'moe_load_balancer'] = moe_load_balancer_file
config['generation_servers']['moe_config'][
    'load_balancer'] = moe_load_balancer_file

if mtp_size > 0:
    config['context_servers']['speculative_config'] = {

@@ -1,8 +1,6 @@
#!/bin/bash

# !!!
# Make sure that SLURM parameters are correctly set in `disaggr_torch.slurm` before executing this script.
# !!!
echo "Make sure that SLURM parameters are correctly set in \`disaggr_torch.slurm\` before executing this script."

# concurrency 8
concurrency=8

examples/wide_ep/README.md (new file, 83 lines)
@@ -0,0 +1,83 @@
# Wide Expert Parallelism (Wide-EP) in TensorRT-LLM

TensorRT-LLM's Wide Expert Parallelism (Wide-EP) feature enables efficient inference of large-scale Mixture-of-Experts (MoE) models by scaling expert parallelism beyond traditional limits. This feature addresses the inherent workload imbalance challenges in large-scale MoE models and provides both offline and online load balancing capabilities.

## Overview

Large-scale MoE models like DeepSeek-V3/R1, LLaMA4, and Qwen3 use fine-grained expert designs that introduce new challenges for inference systems:

- **High memory demands** for expert weights
- **Inherent expert-level workload imbalance** due to sparse execution patterns
- **Communication overhead** in distributed expert parallelism

Wide-EP solves these challenges through:

- **Custom EP communication kernels** optimized for NVIDIA GB200 Multi-Node NVLink (MNNVL)
- **Expert Parallelism Load Balancer (EPLB)** with both offline and online modes
- **Dynamic expert placement and replication** strategies
- **Layer-wise weight redistribution** to minimize inference disruption

## Quick Start

### 1. Configurations

An example YAML file to enable Wide-EP:
```yaml
moe_config:
  backend: WIDEEP
  max_num_tokens: 9216
  load_balancer: moe_load_balancer.yaml # (optional) enable load balancer
```

| Parameter | Description | Default | Notes |
|-----------|-------------|---------|-------|
| `backend` | MoE backend type | `CUTLASS` | Set to `WIDEEP` to enable Wide-EP |
| `max_num_tokens` | If set, at most `max_num_tokens` tokens are sent to `torch.ops.trtllm.fused_moe` at a time. | `None` | If the number of tokens exceeds `max_num_tokens`, the input tensors are split into chunks and processed in a loop. |
| `load_balancer` | Configuration for MoE load balancing | `None` | Path to the load balancer YAML file |
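
As a minimal sketch of how such a file can be used (assuming it is saved as `wide_ep_config.yaml` and the model is served with `trtllm-serve`; the model path below is a placeholder, and the exact CLI flags should be checked against your TensorRT-LLM version):

```bash
# Write the Wide-EP overrides shown above to a file (contents mirror the example YAML).
cat > wide_ep_config.yaml <<'EOF'
moe_config:
  backend: WIDEEP
  max_num_tokens: 9216
  # load_balancer: moe_load_balancer.yaml  # uncomment once the load balancer file exists
EOF

# Launch an OpenAI-compatible server with the overrides applied.
# The model path is a placeholder; replace it with your checkpoint.
trtllm-serve /path/to/DeepSeek-R1 \
    --extra_llm_api_options wide_ep_config.yaml
```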

#### Load Balancer Configuration

An example `moe_load_balancer.yaml` file to configure the online EP load balancer:
```yaml
num_slots: 288
layer_updates_per_iter: 1
```

| Parameter | Description | Default | Notes |
|-----------|-------------|---------|-------|
| `num_slots` | Total number of expert slots | `None` | Must be ≥ the total number of experts |
| `layer_updates_per_iter` | Number of layers updated per iteration | `0` | `0` = offline, `>0` = online |
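
To make the sizing constraints concrete, here is a small sanity check (a sketch only; 256 routed experts matches DeepSeek-R1, while the EP size of 32 is an assumption for illustration):

```bash
# Check a load-balancer configuration before deploying it (illustrative numbers).
NUM_EXPERTS=256   # routed experts per MoE layer (DeepSeek-R1)
EP_SIZE=32        # number of expert-parallel ranks (assumed for this example)
NUM_SLOTS=288     # num_slots from moe_load_balancer.yaml

if (( NUM_SLOTS < NUM_EXPERTS )); then
    echo "num_slots must be >= the total number of experts" >&2
    exit 1
fi
if (( NUM_SLOTS % EP_SIZE != 0 )); then
    echo "warning: num_slots is not evenly divisible by the EP size" >&2
fi
echo "slots per rank: $((NUM_SLOTS / EP_SIZE)), redundant slots: $((NUM_SLOTS - NUM_EXPERTS))"
```

With these numbers each rank hosts 9 slots, and the 32 slots beyond the 256 experts are available for replicating hot experts.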

Refer to the [ep_load_balancer](./ep_load_balancer/) directory for more details on the EP load balancer.

### 2. Execute Wide-EP on SLURM Clusters

Refer to the [slurm_scripts](./slurm_scripts/) directory, which reuses the [disaggregated slurm scripts](../disaggregated/slurm/) to automatically generate configuration files and submit jobs to SLURM clusters.
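
For orientation, the sketch below mirrors one submission from the end-to-end script included in this change (the positional arguments are consumed by `disaggr_torch.slurm`; treat the concrete values as an example rather than a recommendation, and note that reading `ctx_num` as the number of context servers is an assumption based on that script):

```bash
# Submit one DEP16 disaggregated serving job on GB200 nodes (4 GPUs per node).
mtp_size=0
ntasks_per_node=4
concurrency=1024
ctx_num=$(((concurrency + 5499) / 5500))   # number of context (prefill) servers
total_node_num=$((ctx_num + 4))            # context nodes + 4 generation nodes (16 GPUs)
ntasks=$((total_node_num * ntasks_per_node))

sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} \
    disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 16 1024 1024 true "0.7" 288 "$mtp_size" "$concurrency"
```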

## Troubleshooting

### Transparent HugePages failure

If you hit the exception `madvise(MADV_HUGEPAGE) failed.`, check whether Transparent HugePages is enabled:
```bash
>$ cat /sys/kernel/mm/transparent_hugepage/enabled
always [madvise] never
>$ cat /sys/kernel/mm/transparent_hugepage/defrag
always defer defer+madvise [madvise] never
```
If `never` is highlighted, enable Transparent HugePages with the following command:
```bash
echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
```
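
Note that the redirection above must run in a root shell; `sudo echo madvise > ...` does not work because the redirection is performed by the unprivileged shell. A small guard like the following (a sketch using the standard sysfs path) achieves the same effect for non-root users and verifies the result:

```bash
# Enable madvise-based Transparent HugePages if it is not already active (requires sudo).
THP=/sys/kernel/mm/transparent_hugepage/enabled
if ! grep -qE '\[(always|madvise)\]' "$THP"; then
    echo madvise | sudo tee "$THP" > /dev/null
fi
cat "$THP"   # expect: always [madvise] never
```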

### Disaggregated serving related issues

Refer to the [Troubleshooting and FAQ](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/disaggregated-service.md#troubleshooting-and-faq) section of the Disaggregated-Service documentation.

## References

- [Technical Blog: Scaling Expert Parallelism in TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md)

For detailed implementation examples and advanced usage, see the subdirectories:
- [`ep_load_balancer/`](ep_load_balancer/): Load balancing tools and examples
- [`slurm_scripts/`](slurm_scripts/): Cluster deployment scripts

@@ -1,30 +1,19 @@
#!/bin/bash

# !!!
# Please find the `disaggr_torch.slurm` script in the `examples/disaggregated/slurm/` directory.
# Make sure that SLURM parameters are correctly set in `disaggr_torch.slurm` before executing this script.
# !!!
echo "Please find the \`disaggr_torch.slurm\` script in the \`examples/disaggregated/slurm/\` directory."
echo "Make sure that SLURM parameters are correctly set in \`disaggr_torch.slurm\` before executing this script."

mtp_size=0
ntasks_per_node=4 # 4 GPUs per GB200 node

# dep8
for b in 1 64 1024; do
    concurrency=$((b * 8))
    ctx_num=$(((concurrency + 5499)/5500))
    total_node_num=$((ctx_num + 2))
    ntasks=$((total_node_num * ntasks_per_node))
    sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 8 1024 1024 true "0.8" 0 "$mtp_size" "$concurrency"
done

# dep16 eplb0, 256, 288
for b in 1 64 1024; do
    concurrency=$((b * 16))
    ctx_num=$(((concurrency + 5499)/5500))
    total_node_num=$((ctx_num + 4))
    ntasks=$((total_node_num * ntasks_per_node))
    sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 16 1024 1024 true "0.7" 0 "$mtp_size" "$concurrency"
    sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 16 1024 1024 true "0.7" 256 "$mtp_size" "$concurrency"
    # sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 16 1024 1024 true "0.7" 0 "$mtp_size" "$concurrency"
    # sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 16 1024 1024 true "0.7" 288 "$mtp_size" "$concurrency"
    sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 16 1024 1024 true "0.7" 288 "$mtp_size" "$concurrency"
done