From 7c6c49399361e025c8a68474f82f2a20ba5aa1d8 Mon Sep 17 00:00:00 2001 From: Emma Qiao Date: Sun, 7 Dec 2025 22:26:47 +0800 Subject: [PATCH 01/10] [None][infra] Waive failed cases for main branch on 12/07 (#9769) Signed-off-by: qqiao --- tests/integration/test_lists/waives.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 8ee57361c3..2d7bfd20d7 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -432,3 +432,8 @@ disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nix disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_overlap_cuda_graph[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5719561) disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5719561) disaggregated/test_workers.py::test_workers_kv_cache_aware_router_deepseek_v3_lite_bf16[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5719561) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5721661) +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/5715568) +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5721672) +unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[CUTLASS] SKIP (https://nvbugs/5721912) +unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py::test_flashinfer_attention_op_context_input_pos[cuda-dtype0-4-8-seq6] SKIP (https://nvbugs/5721907) From f59d64e6c7af038141b77df1d67d37caf53fcb6b Mon Sep 17 00:00:00 2001 From: Yanchao Lu Date: Sun, 7 Dec 2025 23:07:59 +0800 Subject: [PATCH 02/10] [None][fix] Several minor fixes to CI setting (#9765) Signed-off-by: Yanchao Lu --- jenkins/L0_Test.groovy | 10 +++++++--- jenkins/scripts/slurm_run.sh | 7 +++++-- scripts/check_test_list.py | 7 +++---- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index fe4434a86c..41c66a7887 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -1639,6 +1639,7 @@ def launchTestListCheck(pipeline) sh "tar -zxf ${tarName}" def llmPath = sh (script: "realpath .", returnStdout: true).trim() def llmSrc = "${llmPath}/TensorRT-LLM/src" + trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install -r ${llmSrc}/requirements-dev.txt") sh "NVIDIA_TRITON_SERVER_VERSION=25.10 LLM_ROOT=${llmSrc} LLM_BACKEND_ROOT=${llmSrc}/triton_backend python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa --waive" } catch (InterruptedException e) { throw e @@ -2903,8 +2904,10 @@ def launchTestJobs(pipeline, testFilter) "DGX_B200-4_GPUs-PyTorch-2": ["b200-x4", "l0_dgx_b200", 2, 2, 4], "DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4], "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8", "l0_dgx_b200", 1, 1, 8], - "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 1, 4, 1, true], - "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4], + "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true], + "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true], + "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": 
["b300-x4", "l0_dgx_b300", 1, 2, 4], + "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4], // Perf sanity post merge test // Disable perf stages due to https://nvbugs/5643646 // "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4], @@ -2933,7 +2936,8 @@ def launchTestJobs(pipeline, testFilter) fullSet += SBSATestConfigs.keySet() SBSASlurmTestConfigs = [ - "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4], + "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4], + "GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4], "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4], // Disable GB300 stages due to nodes will be offline temporarily. // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1], diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh index 49368b94c0..8f191b3edb 100755 --- a/jenkins/scripts/slurm_run.sh +++ b/jenkins/scripts/slurm_run.sh @@ -29,10 +29,14 @@ set_value_in_command() { echo "$result" } -# Only the first process will save the job ID +# Only the first process will save the job ID and set the git config if [ $SLURM_PROCID -eq 0 ]; then # Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt + # Update HOME/.gitconfig + if ! git config --global --get-all safe.directory | grep -Fxq "*"; then + git config --global --add safe.directory "*" + fi fi if [ $SLURM_LOCALID -eq 0 ]; then @@ -47,7 +51,6 @@ if [ $SLURM_LOCALID -eq 0 ]; then fi cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl - git config --global --add safe.directory "*" gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true) hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}" echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName" diff --git a/scripts/check_test_list.py b/scripts/check_test_list.py index c7b5357d25..c799d433fc 100755 --- a/scripts/check_test_list.py +++ b/scripts/check_test_list.py @@ -23,10 +23,9 @@ MARKER_LIST_IN_TEST = [" TIMEOUT"] def install_python_dependencies(llm_src): - subprocess.run( - f"cd {llm_src} && pip3 install --retries 1 -r requirements-dev.txt", - shell=True, - check=True) + subprocess.run(f"cd {llm_src} && pip3 install -r requirements-dev.txt", + shell=True, + check=True) subprocess.run( f"pip3 install --force-reinstall --no-deps {llm_src}/../tensorrt_llm-*.whl", shell=True, From d252101a769e730907b13f559844e28d4b6fcdcd Mon Sep 17 00:00:00 2001 From: Chenjie Luo <108829653+cjluo-nv@users.noreply.github.com> Date: Sun, 7 Dec 2025 07:14:05 -0800 Subject: [PATCH 03/10] [OMNIML-3036][doc] Re-branding TensorRT-Model-Optimizer as Nvidia Model-Optimizer (#9679) Signed-off-by: Chenjie Luo --- ATTRIBUTIONS-Python.md | 4 ++-- README.md | 4 ++-- ...ing_Expert_Parallelism_in_TensorRT-LLM_part3.md | 2 +- ..._DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md | 2 +- ...pSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md | 2 +- docs/source/developer-guide/perf-benchmarking.md | 4 ++-- docs/source/developer-guide/perf-overview.md | 2 +- docs/source/features/auto_deploy/support_matrix.md | 2 +- docs/source/features/quantization.md | 8 ++++---- .../source/legacy/performance/perf-benchmarking.md | 2 +- docs/source/torch/auto_deploy/support_matrix.md | 2 
+- docs/source/torch/features/quantization.md | 6 +++--- examples/auto_deploy/README.md | 8 ++++---- examples/disaggregated/README.md | 2 +- .../_tensorrt_engine/llm_medusa_decoding.py | 4 ++-- .../llm-api/_tensorrt_engine/quickstart_example.py | 2 +- examples/llm-api/llm_inference.py | 2 +- examples/llm-api/quickstart_example.py | 2 +- examples/medusa/README.md | 2 +- examples/models/core/deepseek_v3/README.md | 6 +++--- examples/models/core/exaone/README.md | 10 +++++----- examples/models/core/llama/README.md | 2 +- examples/models/core/llama4/README.md | 6 +++--- examples/models/core/qwen/README.md | 14 +++++++------- examples/quantization/README.md | 2 +- .../examples/models/core/mllama/poetry.lock | 2 +- security_scanning/poetry.lock | 2 +- 27 files changed, 53 insertions(+), 53 deletions(-) diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md index f7360a7e93..4e350512a2 100644 --- a/ATTRIBUTIONS-Python.md +++ b/ATTRIBUTIONS-Python.md @@ -25486,7 +25486,7 @@ limitations under the License. ``` ### URLs - - `Homepage`: https://github.com/NVIDIA/TensorRT-Model-Optimizer + - `Homepage`: https://github.com/NVIDIA/Model-Optimizer ## nvidia-modelopt-core (0.33.1) @@ -25513,7 +25513,7 @@ limitations under the License. ``` ### URLs - - `Homepage`: https://github.com/NVIDIA/TensorRT-Model-Optimizer + - `Homepage`: https://github.com/NVIDIA/Model-Optimizer ## nvidia-nccl-cu12 (2.27.3) diff --git a/README.md b/README.md index f09c61783d..208767b037 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,7 @@ state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.< [➡️ link](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml) -* [2024/08/20] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12 +* [2024/08/20] 🏎️SDXL with #Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12 [➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/) * [2024/08/13] 🐍 DIY Code Completion with #Mamba ⚡ #TensorRT #LLM for speed 🤖 NIM for ease ☁️ deploy anywhere @@ -209,7 +209,7 @@ Technical Deep Dive for serious coders ✅+99% compression ✅1 set of weights * [2024/05/21] ✨@modal_labs has the codes for serverless @AIatMeta Llama 3 on #TensorRT #LLM ✨👀 📚 Marvelous Modal Manual: Serverless TensorRT LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.com/docs/examples/trtllm_llama) -* [2024/05/08] NVIDIA TensorRT Model Optimizer -- the newest member of the #TensorRT ecosystem is a library of post-training and training-in-the-loop model optimization techniques ✅quantization ✅sparsity ✅QAT [➡️ blog](https://developer.nvidia.com/blog/accelerate-generative-ai-inference-performance-with-nvidia-tensorrt-model-optimizer-now-publicly-available/) +* [2024/05/08] NVIDIA Model Optimizer -- the newest member of the #TensorRT ecosystem is a library of post-training and training-in-the-loop model optimization techniques ✅quantization ✅sparsity ✅QAT [➡️ blog](https://developer.nvidia.com/blog/accelerate-generative-ai-inference-performance-with-nvidia-tensorrt-model-optimizer-now-publicly-available/) * [2024/05/07] 🦙🦙🦙 24,000 tokens per second 🛫Meta Llama 3 takes off with #TensorRT #LLM 📚[➡️ link](https://blogs.nvidia.com/blog/meta-llama3-inference-acceleration/) diff --git a/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md 
b/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md index 4b80603e29..800c406bd2 100644 --- a/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md +++ b/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md @@ -46,7 +46,7 @@ In this third blog of our scaling Expert Parallelism (EP) series, we push the pe The wo GEMM is the final linear layer within the multi-head attention block that produces the final outputs. While DeepSeek R1's MLA modifies the initial projections for keys and values, the wo GEMM operator remains a critical and standard component for finalizing the attention computation. In the term, "wo" is the abbreviation for the weight matrix for the output. -We've evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace: +We've evaluated that quantizing the wo GEMM to FP4 still satisfies the accuracy requirements, maintaining a similar MTP accept rate (AR) while improving end-to-end performance. The [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) team has published checkpoints that additionally quantize the wo module in attention layers to FP4 on HuggingFace: * https://huggingface.co/nvidia/DeepSeek-R1-FP4-v2 * https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4-v2 diff --git a/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md b/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md index cd55d049d4..b5e3e6558a 100644 --- a/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md +++ b/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md @@ -67,7 +67,7 @@ We have explored a mixed precision recipe, which provides a better tradeoff betw *TensorRT LLM already supports [FP8 Attention](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/deepseek_v3#fp8-kv-cache-and-mla) while for this latency scenario low-precision attention computation doesn't help with performance so we choose to use bf16 precision for the Attention Modules. -** nvfp4 model checkpoint is generated by the [NVIDIA TensorRT Model Optimizer toolkit](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +** nvfp4 model checkpoint is generated by the [NVIDIA Model Optimizer toolkit](https://github.com/NVIDIA/Model-Optimizer). *** RouterGEMM uses bf16 inputs/weights with fp32 outputs for numerical stability diff --git a/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md b/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md index 2da07411a8..d2483af3f3 100644 --- a/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md +++ b/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md @@ -29,7 +29,7 @@ The mixed precision recipe for DeepSeek R1 throughput scenario is almost the sam * FP8 KV cache and FP8 attention, rather than BF16 precision. 
* FP4 Allgather for better communication bandwidth utilization. -The checkpoint used in this blog is hosted in [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). The accuracy score of common dataset on this FP4 checkpoint and TensorRT LLM implementations are: +The checkpoint used in this blog is hosted in [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). The accuracy score of common dataset on this FP4 checkpoint and TensorRT LLM implementations are: | Precision | GPQA Diamond | MATH-500 | :-- | :-- | :-- | diff --git a/docs/source/developer-guide/perf-benchmarking.md b/docs/source/developer-guide/perf-benchmarking.md index 4e4e3ca421..57ef00d8f6 100644 --- a/docs/source/developer-guide/perf-benchmarking.md +++ b/docs/source/developer-guide/perf-benchmarking.md @@ -423,10 +423,10 @@ checkpoint. For the Llama-3.1 models, TensorRT LLM provides the following checkp - [`nvidia/Llama-3.1-70B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-70B-Instruct-FP8) - [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8) -To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/deployment/1_tensorrt_llm.html). +To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/Model-Optimizer/deployment/1_tensorrt_llm.html). `trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration -file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) +file is present in checkpoints quantized with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) and describes the compute and KV cache quantization that checkpoint was compiled with. For example, from the checkpoints above: diff --git a/docs/source/developer-guide/perf-overview.md b/docs/source/developer-guide/perf-overview.md index 0a144a58d4..aefa91fd43 100644 --- a/docs/source/developer-guide/perf-overview.md +++ b/docs/source/developer-guide/perf-overview.md @@ -21,7 +21,7 @@ and shows the throughput scenario under maximum load. The reported metric is `To The performance numbers below were collected using the steps described in this document. -Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4). +Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4). 
*(NEW for v1.0) RTX 6000 Pro Blackwell Server Edition Benchmarks:* diff --git a/docs/source/features/auto_deploy/support_matrix.md b/docs/source/features/auto_deploy/support_matrix.md index 26c07b308b..fec6d841af 100644 --- a/docs/source/features/auto_deploy/support_matrix.md +++ b/docs/source/features/auto_deploy/support_matrix.md @@ -120,7 +120,7 @@ Optimize attention operations with different attention kernel implementations: ### Precision Support -AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`TensorRT-Model-Optimizer`](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`Model-Optimizer`](https://github.com/NVIDIA/Model-Optimizer). **Supported precision types include:** diff --git a/docs/source/features/quantization.md b/docs/source/features/quantization.md index 8a0e160529..e057a91b39 100644 --- a/docs/source/features/quantization.md +++ b/docs/source/features/quantization.md @@ -23,7 +23,7 @@ The default PyTorch backend supports FP4 and FP8 quantization on the latest Blac ### Running Pre-quantized Models -TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). ```python from tensorrt_llm import LLM @@ -54,8 +54,8 @@ If a pre-quantized model is not available on the [Hugging Face Hub](https://hugg Follow this step-by-step guide to quantize a model: ```bash -git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git -cd TensorRT-Model-Optimizer/examples/llm_ptq +git clone https://github.com/NVIDIA/Model-Optimizer.git +cd Model-Optimizer/examples/llm_ptq scripts/huggingface_example.sh --model --quant fp8 --export_fmt hf ``` @@ -108,4 +108,4 @@ FP8 block wise scaling GEMM kernels for sm100 are using MXFP8 recipe (E4M3 act/w ## Quick Links - [Pre-quantized Models by ModelOpt](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) -- [ModelOpt Support Matrix](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/0_support_matrix.html) +- [ModelOpt Support Matrix](https://nvidia.github.io/Model-Optimizer/guides/0_support_matrix.html) diff --git a/docs/source/legacy/performance/perf-benchmarking.md b/docs/source/legacy/performance/perf-benchmarking.md index 55caef07ba..5efd6625f0 100644 --- a/docs/source/legacy/performance/perf-benchmarking.md +++ b/docs/source/legacy/performance/perf-benchmarking.md @@ -662,7 +662,7 @@ checkpoint. For the Llama-3.1 models, TensorRT-LLM provides the following checkp - [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8) `trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration -file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) +file is present in checkpoints quantized with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) and describes the compute and KV cache quantization that checkpoint was compiled with. 
For example, from the checkpoints above: diff --git a/docs/source/torch/auto_deploy/support_matrix.md b/docs/source/torch/auto_deploy/support_matrix.md index c8780cbca1..f0158253dd 100644 --- a/docs/source/torch/auto_deploy/support_matrix.md +++ b/docs/source/torch/auto_deploy/support_matrix.md @@ -118,7 +118,7 @@ Optimize attention operations with different attention kernel implementations: ### Precision Support -AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`TensorRT-Model-Optimizer`](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`Model-Optimizer`](https://github.com/NVIDIA/Model-Optimizer). **Supported precision types include:** diff --git a/docs/source/torch/features/quantization.md b/docs/source/torch/features/quantization.md index a2b6c48be2..47cc745165 100644 --- a/docs/source/torch/features/quantization.md +++ b/docs/source/torch/features/quantization.md @@ -1,7 +1,7 @@ # Quantization The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized models in HF model hub, -which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +which are generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). ```python from tensorrt_llm._torch import LLM @@ -12,7 +12,7 @@ llm.generate("Hello, my name is") Or you can try the following commands to get a quantized model by yourself: ```bash -git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git -cd TensorRT-Model-Optimizer/examples/llm_ptq +git clone https://github.com/NVIDIA/Model-Optimizer.git +cd Model-Optimizer/examples/llm_ptq scripts/huggingface_example.sh --model --quant fp8 --export_fmt hf ``` diff --git a/examples/auto_deploy/README.md b/examples/auto_deploy/README.md index c89c1a552c..5343d88999 100644 --- a/examples/auto_deploy/README.md +++ b/examples/auto_deploy/README.md @@ -90,16 +90,16 @@ python lm_eval_ad.py \ --model autodeploy --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,world_size=2 --tasks mmlu ``` -### Mixed-precision Quantization using TensorRT Model Optimizer +### Mixed-precision Quantization using Model Optimizer -TensorRT Model Optimizer [AutoQuantize](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) algorithm is a PTQ algorithm from ModelOpt which quantizes a model by searching for the best quantization format per-layer while meeting the performance constraint specified by the user. This way, `AutoQuantize` enables to trade-off model accuracy for performance. +Model Optimizer [AutoQuantize](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) algorithm is a PTQ algorithm from ModelOpt which quantizes a model by searching for the best quantization format per-layer while meeting the performance constraint specified by the user. This way, `AutoQuantize` enables to trade-off model accuracy for performance. Currently `AutoQuantize` supports only `effective_bits` as the performance constraint (for both weight-only quantization and weight & activation quantization). 
See -[AutoQuantize documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) for more details. +[AutoQuantize documentation](https://nvidia.github.io/Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize) for more details. #### 1. Quantize a model with ModelOpt -Refer to [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/examples/llm_autodeploy/README.md) for generating quantized model checkpoint. +Refer to [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer/blob/main/examples/llm_autodeploy/README.md) for generating quantized model checkpoint. #### 2. Deploy the quantized model with AutoDeploy diff --git a/examples/disaggregated/README.md b/examples/disaggregated/README.md index 511bce3619..8b99f8845f 100644 --- a/examples/disaggregated/README.md +++ b/examples/disaggregated/README.md @@ -212,7 +212,7 @@ In disaggregated serving, the context workers and generation workers have differ ### Prerequisites To enable mixed precision serving, you will need: -1. A quantized checkpoint created with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) +1. A quantized checkpoint created with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) 2. The original unquantized checkpoint (Can also be quantized) 3. Both checkpoints must use the same KV cache dtype to ensure compatibility during transfer diff --git a/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py b/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py index b6d7f90c0f..f45411b233 100644 --- a/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py +++ b/examples/llm-api/_tensorrt_engine/llm_medusa_decoding.py @@ -29,7 +29,7 @@ def run_medusa_decoding(use_modelopt_ckpt=False, model_dir=None): llm_kwargs = {} if use_modelopt_ckpt: - # This is a Llama-3.1-8B combined with Medusa heads provided by TensorRT Model Optimizer. + # This is a Llama-3.1-8B combined with Medusa heads provided by Model Optimizer. # Both the base model (except lm_head) and Medusa heads have been quantized in FP8. model = model_dir or "nvidia/Llama-3.1-8B-Medusa-FP8" @@ -85,7 +85,7 @@ if __name__ == '__main__': parser.add_argument( '--use_modelopt_ckpt', action='store_true', - help="Use FP8-quantized checkpoint from TensorRT Model Optimizer.") + help="Use FP8-quantized checkpoint from Model Optimizer.") # TODO: remove this arg after ModelOpt ckpt is public on HF parser.add_argument('--model_dir', type=Path, default=None) args = parser.parse_args() diff --git a/examples/llm-api/_tensorrt_engine/quickstart_example.py b/examples/llm-api/_tensorrt_engine/quickstart_example.py index a6ba9ec559..d02f55c46b 100644 --- a/examples/llm-api/_tensorrt_engine/quickstart_example.py +++ b/examples/llm-api/_tensorrt_engine/quickstart_example.py @@ -9,7 +9,7 @@ def main(): build_config.max_num_tokens = 1024 # Model could accept HF model name, a path to local HF model, - # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. + # or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. 
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", build_config=build_config) diff --git a/examples/llm-api/llm_inference.py b/examples/llm-api/llm_inference.py index 5146504d25..6c806f0768 100644 --- a/examples/llm-api/llm_inference.py +++ b/examples/llm-api/llm_inference.py @@ -7,7 +7,7 @@ from tensorrt_llm import LLM, SamplingParams def main(): # Model could accept HF model name, a path to local HF model, - # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. + # or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0") # Sample prompts. diff --git a/examples/llm-api/quickstart_example.py b/examples/llm-api/quickstart_example.py index 400a241c0e..2d6f14012b 100644 --- a/examples/llm-api/quickstart_example.py +++ b/examples/llm-api/quickstart_example.py @@ -4,7 +4,7 @@ from tensorrt_llm import LLM, SamplingParams def main(): # Model could accept HF model name, a path to local HF model, - # or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. + # or Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF. llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0") # Sample prompts. diff --git a/examples/medusa/README.md b/examples/medusa/README.md index eb442554ec..7820335cd0 100644 --- a/examples/medusa/README.md +++ b/examples/medusa/README.md @@ -19,7 +19,7 @@ For more info about Medusa visit [speculative decoding documentation](https://nv The TensorRT LLM Medusa example code is located in [`examples/medusa`](./). There is one [`convert_checkpoint.py`](./convert_checkpoint.py) file to convert and build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run models with Medusa decoding support. In this example, we demonstrate the usage of two models: 1. The Vucuna 7B model from Hugging Face [`FasterDecoding/medusa-vicuna-7b-v1.3`](https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3) with its Medusa heads [`medusa-vicuna-7b-v1.3`](https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3). -2. The quantized checkpoint [`nvidia/Llama-3.1-8B-Medusa-FP8`](https://huggingface.co/nvidia/Llama-3.1-8B-Medusa-FP8) on Hugging Face by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) (ModelOpt). This model is based on [Llama-3.1 8B](https://huggingface.co/meta-llama/Llama-3.1-8B) and enhanced with Medusa heads, with both the base model (except lm_head) and Medusa heads already quantized in FP8. +2. The quantized checkpoint [`nvidia/Llama-3.1-8B-Medusa-FP8`](https://huggingface.co/nvidia/Llama-3.1-8B-Medusa-FP8) on Hugging Face by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) (ModelOpt). This model is based on [Llama-3.1 8B](https://huggingface.co/meta-llama/Llama-3.1-8B) and enhanced with Medusa heads, with both the base model (except lm_head) and Medusa heads already quantized in FP8. ### Build TensorRT engine(s) Get the weights by downloading base model [`vicuna-7b-v1.3`](https://huggingface.co/lmsys/vicuna-7b-v1.3) and Medusa Heads [`medusa-vicuna-7b-v1.3`](https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3) from HF. 
diff --git a/examples/models/core/deepseek_v3/README.md b/examples/models/core/deepseek_v3/README.md index 3e82442563..934db2e493 100644 --- a/examples/models/core/deepseek_v3/README.md +++ b/examples/models/core/deepseek_v3/README.md @@ -773,7 +773,7 @@ You can enable FP8 MLA through either of these methods: **Option 1: Checkpoint config** -TensorRT LLM automatically detects the `hf_quant_config.json` file in the model directory, which configures both GEMM and KV cache quantization. For example, see the FP4 DeepSeek-R1 checkpoint [configuration](https://huggingface.co/nvidia/DeepSeek-R1-FP4/blob/main/hf_quant_config.json) provided by [ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +TensorRT LLM automatically detects the `hf_quant_config.json` file in the model directory, which configures both GEMM and KV cache quantization. For example, see the FP4 DeepSeek-R1 checkpoint [configuration](https://huggingface.co/nvidia/DeepSeek-R1-FP4/blob/main/hf_quant_config.json) provided by [ModelOpt](https://github.com/NVIDIA/Model-Optimizer). To enable FP8 MLA, modify the `kv_cache_quant_algo` property. The following shows the config for DeepSeek's block-wise FP8 GEMM quantization + FP8 MLA: @@ -808,14 +808,14 @@ Or you can follow the steps to generate one by yourselves. #### Activation calibration -[ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is used for calibrating activations of MoE layers. We provide a calibrated file at [HF model hub](https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/blob/main/act_scales.safetensors) or you can run the following commands to generate by yourselves. +[ModelOpt](https://github.com/NVIDIA/Model-Optimizer) is used for calibrating activations of MoE layers. We provide a calibrated file at [HF model hub](https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/blob/main/act_scales.safetensors) or you can run the following commands to generate by yourselves. ```bash # Make sure for enough GPU resources (8xH200s) to run the following commands PATH_OF_DEEPSEEK_R1=/llm-models/DeepSeek-R1/DeepSeek-R1 # Install ModelOpt from source -git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer/ && cd modelopt +git clone https://github.com/NVIDIA/Model-Optimizer/ && cd modelopt pip install "nvidia-modelopt[all]" -U --extra-index-url https://pypi.nvidia.com # Clone DeepSeek-V3 (base model of R1) Github repository for FP8 inference, diff --git a/examples/models/core/exaone/README.md b/examples/models/core/exaone/README.md index 549b83843a..9ea4a9e71d 100644 --- a/examples/models/core/exaone/README.md +++ b/examples/models/core/exaone/README.md @@ -85,17 +85,17 @@ The output will be like: #### PyTorch flow Quantization -For PyTorch flow, TRT-LLM supports quantized format generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +For PyTorch flow, TRT-LLM supports quantized format generated by [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). 
You can either do pre-quantized models in HF model hub, or can generate quantized model by yourself and then run models with below command: ```bash -git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git -cd TensorRT-Model-Optimizer/examples/llm_ptq +git clone https://github.com/NVIDIA/Model-Optimizer.git +cd Model-Optimizer/examples/llm_ptq scripts/huggingface_example.sh --model hf_models/$MODEL_NAME --quant fp8 --export_fmt hf ``` -For more information, please refer to official [docs](https://github.com/NVIDIA/TensorRT-Model-Optimizer) or [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). +For more information, please refer to official [docs](https://github.com/NVIDIA/Model-Optimizer) or [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer). Troubleshooting @@ -107,7 +107,7 @@ Hint: Move the offending context manager(s) to outside the compiled region. Hint: This graph break may have been caused by an earlier graph break. Resolving the earlier graph break may resolve this one. ``` -This error may indicate an incompatibility between `torch.compile()` and the `HybridCache` module of the transformers library. As a result, [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) (ModelOpt) cannot perform PTQ with HybridCache. +This error may indicate an incompatibility between `torch.compile()` and the `HybridCache` module of the transformers library. As a result, [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) (ModelOpt) cannot perform PTQ with HybridCache. Temporarily switching to `DynamicCache` when creating PTQ models could help address the issue. This can be done by updating the `cache_implementation` field in the `generation_config.json` file located in the model checkpoint directory, for example: ```json diff --git a/examples/models/core/llama/README.md b/examples/models/core/llama/README.md index 464fe8bdf3..df26ac1ad6 100644 --- a/examples/models/core/llama/README.md +++ b/examples/models/core/llama/README.md @@ -1559,7 +1559,7 @@ Explanation: ### Launch trtllm-serve OpenAI-compatible API server -TensorRT LLM supports nvidia TensorRT Model Optimizer quantized FP8 checkpoint +TensorRT LLM supports nvidia Model Optimizer quantized FP8 checkpoint ``` bash trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 \ --tp_size 8 \ diff --git a/examples/models/core/llama4/README.md b/examples/models/core/llama4/README.md index 93e3778864..a6c02070e9 100644 --- a/examples/models/core/llama4/README.md +++ b/examples/models/core/llama4/README.md @@ -42,7 +42,7 @@ Explanation: #### 2. Launch trtllm-serve OpenAI-compatible API server -TensorRT LLM supports nvidia TensorRT Model Optimizer quantized FP8 checkpoint +TensorRT LLM supports nvidia Model Optimizer quantized FP8 checkpoint ``` bash trtllm-serve nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 \ --max_batch_size 512 \ @@ -94,7 +94,7 @@ Explanation: #### 2. Launch trtllm-serve OpenAI-compatible API server -TensorRT LLM supports nvidia TensorRT Model Optimizer quantized FP8 checkpoint. +TensorRT LLM supports nvidia Model Optimizer quantized FP8 checkpoint. ``` bash trtllm-serve nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 \ --max_batch_size 8 \ @@ -140,7 +140,7 @@ Explanation: #### 2. Launch trtllm-serve OpenAI-compatible API server -TensorRT LLM supports nvidia TensorRT Model Optimizer quantized FP8 checkpoint. +TensorRT LLM supports nvidia Model Optimizer quantized FP8 checkpoint. 
``` bash trtllm-serve nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 \ --tp_size 8 \ diff --git a/examples/models/core/qwen/README.md b/examples/models/core/qwen/README.md index 52a5ecb481..1d3d97b267 100644 --- a/examples/models/core/qwen/README.md +++ b/examples/models/core/qwen/README.md @@ -663,19 +663,19 @@ trtllm-eval --model=Qwen3-30B-A3B/ --tokenizer=Qwen3-30B-A3B/ --backend=pytorch To quantize the Qwen3 model for use with the PyTorch backend, we'll use NVIDIA's Model Optimizer (ModelOpt) tool. Follow these steps: ```bash -# Clone the TensorRT Model Optimizer (ModelOpt) -git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git -pushd TensorRT-Model-Optimizer +# Clone the Model Optimizer (ModelOpt) +git clone https://github.com/NVIDIA/Model-Optimizer.git +pushd Model-Optimizer # install the ModelOpt pip install -e . # Quantize the Qwen3-235B-A22B model by nvfp4 -# By default, the checkpoint would be stored in `TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/`. +# By default, the checkpoint would be stored in `Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/`. ./examples/llm_ptq/scripts/huggingface_example.sh --model Qwen3-235B-A22B/ --quant nvfp4 --export_fmt hf # Quantize the Qwen3-32B model by fp8_pc_pt -# By default, the checkpoint would be stored in `TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-32B_fp8_pc_pt_hf/`. +# By default, the checkpoint would be stored in `Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-32B_fp8_pc_pt_hf/`. ./examples/llm_ptq/scripts/huggingface_example.sh --model Qwen3-32B/ --quant fp8_pc_pt --export_fmt hf popd ``` @@ -687,7 +687,7 @@ To run the benchmark, we suggest using the `trtllm-bench` tool. Please refer to ```bash #!/bin/bash -folder_model=TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/ +folder_model=Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/ path_config=extra-llm-api-config.yml num_gpus=8 ep_size=8 @@ -727,7 +727,7 @@ trtllm-bench --model ${folder_model} --model_path ${folder_model} throughput \ We suggest benchmarking with a real dataset. It will prevent from having improperly distributed tokens in the MoE. Here, we use the `aa_prompt_isl_1k_osl_2k_qwen3_10000samples.txt` dataset. It has 10000 samples with an average input length of 1024 and an average output length of 2048. If you don't have a dataset (this or an other) and you want to run the benchmark, you can use the following command to generate a random dataset: ```bash -folder_model=TensorRT-Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/ +folder_model=Model-Optimizer/examples/llm_ptq/saved_models_Qwen3-235B-A22B_nvfp4_hf/ min_input_len=1024 min_output_len=2048 concurrency=128 diff --git a/examples/quantization/README.md b/examples/quantization/README.md index e74736b61b..b3b2e35b20 100644 --- a/examples/quantization/README.md +++ b/examples/quantization/README.md @@ -11,7 +11,7 @@ The detailed LLM quantization recipe is distributed to the README.md of the corr ## Installation -The NVIDIA TensorRT Model Optimizer quantization toolkit is installed automatically as a dependency of TensorRT-LLM. +The NVIDIA Model Optimizer quantization toolkit is installed automatically as a dependency of TensorRT-LLM. 
```bash # Install the additional requirements diff --git a/security_scanning/examples/models/core/mllama/poetry.lock b/security_scanning/examples/models/core/mllama/poetry.lock index 11e0ed3ccb..c58e7c12b5 100644 --- a/security_scanning/examples/models/core/mllama/poetry.lock +++ b/security_scanning/examples/models/core/mllama/poetry.lock @@ -708,7 +708,7 @@ files = [ [[package]] name = "nvidia-modelopt" version = "0.21.1" -description = "Nvidia TensorRT Model Optimizer: a unified model optimization and deployment toolkit." +description = "Nvidia Model Optimizer: a unified model optimization and deployment toolkit." optional = false python-versions = "<3.13,>=3.8" files = [ diff --git a/security_scanning/poetry.lock b/security_scanning/poetry.lock index 18ed93657e..e5959abf84 100644 --- a/security_scanning/poetry.lock +++ b/security_scanning/poetry.lock @@ -2793,7 +2793,7 @@ files = [ [[package]] name = "nvidia-modelopt" version = "0.37.0" -description = "Nvidia TensorRT Model Optimizer: a unified model optimization and deployment toolkit." +description = "Nvidia Model Optimizer: a unified model optimization and deployment toolkit." optional = false python-versions = "<3.13,>=3.10" files = [ From 41ce14ab0445cb35d4b7d3ac715dffd0a2ae03fb Mon Sep 17 00:00:00 2001 From: Ludwig Schneider Date: Sun, 7 Dec 2025 11:43:26 -0600 Subject: [PATCH 04/10] [None][feat] Enable NCCL_SYMMETRIC as default fallback for AllReduce (#9314) Signed-off-by: Ludwig Schneider --- .../common/customAllReduceUtils.h | 5 +- cpp/tensorrt_llm/common/ncclUtils.cpp | 585 ++++++++++++++ cpp/tensorrt_llm/common/ncclUtils.h | 397 ++++++++++ cpp/tensorrt_llm/common/opUtils.cpp | 25 +- .../kernels/userbuffers/ub_allocator.cpp | 175 +--- .../kernels/userbuffers/ub_allocator.h | 56 -- .../userbuffers/userbuffersManager.cpp | 15 +- .../kernels/userbuffers/userbuffersManager.h | 9 +- .../plugins/ncclPlugin/allreducePlugin.cpp | 62 +- cpp/tensorrt_llm/thop/allreduceOp.cpp | 471 ++++++++--- cpp/tests/unit_tests/multi_gpu/CMakeLists.txt | 6 + .../unit_tests/multi_gpu/ncclUtilsTest.cpp | 745 ++++++++++++++++++ .../_torch/pyexecutor/model_engine.py | 16 +- tensorrt_llm/functional.py | 7 +- tests/integration/defs/cpp/test_multi_gpu.py | 27 + tests/microbenchmarks/all_reduce.py | 4 + .../allreduce_heuristic_code_gen.py | 5 +- .../_torch/multi_gpu/test_allreduce.py | 2 +- .../_torch/multi_gpu/test_mnnvl_allreduce.py | 2 +- .../_torch/multi_gpu/test_user_buffers.py | 3 +- 20 files changed, 2225 insertions(+), 392 deletions(-) create mode 100644 cpp/tensorrt_llm/common/ncclUtils.cpp create mode 100644 cpp/tensorrt_llm/common/ncclUtils.h create mode 100644 cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp diff --git a/cpp/tensorrt_llm/common/customAllReduceUtils.h b/cpp/tensorrt_llm/common/customAllReduceUtils.h index 0a6c2d9d32..9a466512e4 100644 --- a/cpp/tensorrt_llm/common/customAllReduceUtils.h +++ b/cpp/tensorrt_llm/common/customAllReduceUtils.h @@ -81,7 +81,6 @@ inline AllReduceStrategyType SelectStrategyLP(size_t seq_len, size_t hidden_size { return AllReduceStrategyType::ONESHOT; } - return AllReduceStrategyType::NCCL; } // use 1D vector to store the best strategy instead of a map for each sm version @@ -143,7 +142,7 @@ inline AllReduceStrategyType selectStrategyLookUpTable( sm_version = 100; } - // Check if the entry is out of bounds, otherwise return NCCL as fallback + // Check if the entry is out of bounds, otherwise return NCCL_SYMMETRIC as fallback if (AllReduceBestStrategyTable.find(sm_version) == AllReduceBestStrategyTable.end() 
|| tp_index >= AllReduceBestStrategyTable.at(sm_version).size() || fusion_op_index >= AllReduceBestStrategyTable.at(sm_version).at(tp_index).size() @@ -151,7 +150,7 @@ inline AllReduceStrategyType selectStrategyLookUpTable( || num_token_index >= AllReduceBestStrategyTable.at(sm_version).at(tp_index).at(fusion_op_index).at(hidden_size_index).size()) { - return AllReduceStrategyType::NCCL; + return AllReduceStrategyType::NCCL_SYMMETRIC; } return static_cast( diff --git a/cpp/tensorrt_llm/common/ncclUtils.cpp b/cpp/tensorrt_llm/common/ncclUtils.cpp new file mode 100644 index 0000000000..76406fd806 --- /dev/null +++ b/cpp/tensorrt_llm/common/ncclUtils.cpp @@ -0,0 +1,585 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/common/ncclUtils.h" + +#if ENABLE_MULTI_DEVICE + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/logger.h" +#include +#include + +namespace tensorrt_llm::common::nccl_util +{ + +//============================================================================== +// NcclCommResourceManager Implementation +//============================================================================== + +NcclCommResourceManager& NcclCommResourceManager::getInstance() noexcept +{ + static NcclCommResourceManager instance; + return instance; +} + +void NcclCommResourceManager::registerResource(ncclComm_t comm, ResourceCleanupFunc cleanup, char const* debugName) +{ + if (!comm) + { + TLLM_LOG_WARNING("[NCCLUtil] Attempted to register resource for null NCCL comm"); + return; + } + + std::lock_guard lock(mMutex); + auto& resources = mCommResources[comm]; + resources.emplace_back(std::move(cleanup), debugName ? debugName : "unnamed"); + + TLLM_LOG_TRACE("[NCCLUtil] Registered resource '%s' for NCCL comm %p (total: %zu)", + debugName ? 
debugName : "unnamed", static_cast(comm), resources.size()); +} + +void NcclCommResourceManager::cleanupResources(ncclComm_t comm) noexcept +{ + if (!comm) + { + return; + } + + std::vector resourcesToClean; + + { + std::lock_guard lock(mMutex); + auto it = mCommResources.find(comm); + if (it == mCommResources.end()) + { + // Nothing registered for this comm, nothing to clean up + return; + } + + // Move resources out (preserves order) and remove from map + resourcesToClean = std::move(it->second); + mCommResources.erase(it); + + TLLM_LOG_TRACE( + "[NCCLUtil] Cleaning up %zu resources for NCCL comm %p", resourcesToClean.size(), static_cast(comm)); + } + + // Clean up outside the lock to avoid deadlocks if cleanup functions try to access the manager + // Order is preserved: resources are cleaned up in registration order + for (auto& [cleanup, name] : resourcesToClean) + { + try + { + TLLM_LOG_TRACE( + "[NCCLUtil] Cleaning up resource '%s' for NCCL comm %p", name.c_str(), static_cast(comm)); + cleanup(); + } + catch (std::exception const& e) + { + TLLM_LOG_ERROR("[NCCLUtil] Exception during cleanup of resource '%s' for NCCL comm %p: %s", name.c_str(), + static_cast(comm), e.what()); + } + catch (...) + { + TLLM_LOG_ERROR("[NCCLUtil] Unknown exception during cleanup of resource '%s' for NCCL comm %p", + name.c_str(), static_cast(comm)); + } + } +} + +bool NcclCommResourceManager::hasResources(ncclComm_t comm) const noexcept +{ + std::lock_guard lock(mMutex); + return mCommResources.find(comm) != mCommResources.end(); +} + +size_t NcclCommResourceManager::getResourceCount(ncclComm_t comm) const noexcept +{ + std::lock_guard lock(mMutex); + auto it = mCommResources.find(comm); + return it != mCommResources.end() ? it->second.size() : 0; +} + +//============================================================================== +// NCCLHelper Implementation +//============================================================================== + +NCCLHelper& NCCLHelper::getInstance() +{ + static NCCLHelper instance; + return instance; +} + +NCCLHelper::NCCLHelper() + : mLibraryHandle(nullptr) + , mNCCLCommWindowRegister(nullptr) + , mNCCLMemAlloc(nullptr) + , mIsLoaded(false) +{ + loadNCCLLibrary(); +} + +NCCLHelper::~NCCLHelper() +{ + if (mLibraryHandle) + { +#ifdef _WIN32 + FreeLibrary(mLibraryHandle); +#else + dlclose(mLibraryHandle); +#endif + mLibraryHandle = nullptr; + } +} + +void NCCLHelper::loadNCCLLibrary() +{ + try + { +#ifdef _WIN32 + char const* libraryNames[] = {"nccl.dll"}; +#else + char const* libraryNames[] = {"libnccl.so"}; +#endif + + for (auto const* name : libraryNames) + { + mLibraryHandle = loadLibraryHandle(name); + if (mLibraryHandle) + { + TLLM_LOG_INFO("Successfully loaded NCCL library: %s", name); + break; + } + } + + if (!mLibraryHandle) + { + TLLM_LOG_WARNING("Failed to load NCCL library"); + return; + } + + // Load the required symbols + mNCCLCommWindowRegister + = reinterpret_cast(getSymbolAddress(mLibraryHandle, "ncclCommWindowRegister")); + + mNCCLMemAlloc = reinterpret_cast(getSymbolAddress(mLibraryHandle, "ncclMemAlloc")); + + if (mNCCLCommWindowRegister == nullptr) + { + TLLM_LOG_WARNING("Failed to load ncclCommWindowRegister symbol, NCCL symmetric will not be supported."); + } + + if (mNCCLMemAlloc == nullptr) + { + TLLM_LOG_WARNING("Failed to load ncclMemAlloc symbol, NCCL symmetric will not be supported."); + } + + if (mNCCLCommWindowRegister != nullptr && mNCCLMemAlloc != nullptr) + { + mIsLoaded = true; + } + else + { + TLLM_LOG_WARNING( + "Failed to load required 
NCCL symbols (both ncclCommWindowRegister and ncclMemAlloc are required)"); + } + } + catch (std::exception const& e) + { + TLLM_LOG_WARNING("Exception while loading NCCL library: %s", e.what()); + } +} + +void* NCCLHelper::loadLibraryHandle(char const* libName) +{ +#ifdef _WIN32 + return LoadLibraryA(libName); +#else + return dlopen(libName, RTLD_LAZY | RTLD_GLOBAL); +#endif +} + +void* NCCLHelper::getSymbolAddress(void* handle, char const* symbolName) +{ + if (!handle) + { + return nullptr; + } + +#ifdef _WIN32 + return GetProcAddress(static_cast(handle), symbolName); +#else + return dlsym(handle, symbolName); +#endif +} + +NCCLHelper::ncclCommWindowRegisterFunc NCCLHelper::getNCCLCommWindowRegister() +{ + return mNCCLCommWindowRegister; +} + +NCCLHelper::ncclMemAllocFunc NCCLHelper::getNCCLMemAlloc() +{ + return mNCCLMemAlloc; +} + +bool NCCLHelper::isLoaded() const +{ + return mIsLoaded; +} + +//============================================================================== +// NCCLWindowAllocator Implementation +//============================================================================== + +NCCLWindowAllocator& NCCLWindowAllocator::getInstance() +{ + static NCCLWindowAllocator instance; + return instance; +} + +NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size) +{ + TLLM_CHECK_WITH_INFO(comm != nullptr, "NCCL communicator cannot be null"); + TLLM_CHECK_WITH_INFO(size > 0, "Buffer size must be greater than 0"); + + std::lock_guard lock(mMutex); + + // Register cleanup callback for this communicator if not already registered + // This is cheap even if no buffers exist yet - cleanup will just return early + registerBufferCleanup(comm); + + // Check if we have an available buffer of at least the requested size for this communicator + // Use best-fit: find the smallest buffer that's >= requested size + auto& commBuffers = mBufferPool[comm]; + auto bestFit = commBuffers.end(); + size_t bestFitSize = std::numeric_limits::max(); + + for (auto it = commBuffers.begin(); it != commBuffers.end(); ++it) + { + if (!it->inUse && it->buffer.size >= size && it->buffer.size < bestFitSize) + { + bestFit = it; + bestFitSize = it->buffer.size; + } + } + + if (bestFit != commBuffers.end()) + { + bestFit->inUse = true; + TLLM_LOG_TRACE( + "[NCCLUtil] Reusing NCCL window buffer for comm %p: handle=%d, ptr=%p, size=%zu (requested: %zu)", + static_cast(comm), bestFit->buffer.handle, bestFit->buffer.ptr, bestFit->buffer.size, size); + return bestFit->buffer; + } + + // No available buffer found, allocate a new one + TLLM_LOG_TRACE( + "[NCCLUtil] Allocating new NCCL window buffer for comm %p, size=%zu", static_cast(comm), size); + int handle = static_cast(commBuffers.size()); + NCCLWindowBuffer buffer = allocateAndRegisterBuffer(comm, size, handle); + commBuffers.push_back({buffer, true}); + + return buffer; +} + +NCCLWindowBuffer NCCLWindowAllocator::searchBuffer(ncclComm_t comm, void* ptr) const +{ + if (!comm || !ptr) + { + return NCCLWindowBuffer(); + } + + std::lock_guard lock(mMutex); + return searchBufferLocked(comm, ptr); +} + +void NCCLWindowAllocator::releaseBuffer(ncclComm_t comm, void* ptr) +{ + if (!comm || !ptr) + { + return; + } + + std::lock_guard lock(mMutex); + auto commIt = mBufferPool.find(comm); + if (commIt == mBufferPool.end()) + { + TLLM_LOG_WARNING( + "[NCCLUtil] Attempted to release buffer %p for unknown comm %p", ptr, static_cast(comm)); + return; + } + + for (auto& entry : commIt->second) + { + if (entry.buffer.ptr == ptr) + { + entry.inUse = false; + 
TLLM_LOG_TRACE("[NCCLUtil] Released NCCL window buffer for comm %p: ptr=%p", static_cast(comm), ptr); + return; + } + } + + TLLM_LOG_WARNING("[NCCLUtil] Attempted to release unknown buffer %p for comm %p", ptr, static_cast(comm)); +} + +ncclWindow_t NCCLWindowAllocator::getWindow(ncclComm_t comm, void* ptr) const +{ + std::lock_guard lock(mMutex); + NCCLWindowBuffer buffer = searchBufferLocked(comm, ptr); + return buffer.isValid() ? buffer.window : nullptr; +} + +size_t NCCLWindowAllocator::getSize(ncclComm_t comm, void* ptr) const +{ + std::lock_guard lock(mMutex); + NCCLWindowBuffer buffer = searchBufferLocked(comm, ptr); + return buffer.isValid() ? buffer.size : 0; +} + +NCCLWindowBuffer NCCLWindowAllocator::getBufferInfo(ncclComm_t comm, void* ptr) const +{ + std::lock_guard lock(mMutex); + return searchBufferLocked(comm, ptr); +} + +size_t NCCLWindowAllocator::getBufferCount(ncclComm_t comm) const +{ + std::lock_guard lock(mMutex); + auto commIt = mBufferPool.find(comm); + return commIt != mBufferPool.end() ? commIt->second.size() : 0; +} + +size_t NCCLWindowAllocator::getBufferInUseCount(ncclComm_t comm) const +{ + std::lock_guard lock(mMutex); + auto commIt = mBufferPool.find(comm); + if (commIt == mBufferPool.end()) + { + return 0; + } + + size_t count = 0; + for (auto const& entry : commIt->second) + { + if (entry.inUse) + { + ++count; + } + } + return count; +} + +bool NCCLWindowAllocator::isCommValid(ncclComm_t comm) const noexcept +{ + // Simply check for null - all non-null comms are valid + // We don't track cleaned-up comms because NCCL can reuse memory addresses, + // making pointer-based tracking unreliable. New comms will be registered when used. + return comm != nullptr; +} + +NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm, size_t size, int handle) +{ + NCCLWindowBuffer buffer; + buffer.handle = handle; + + // Get NCCL helper for dynamic symbol loading + auto& ncclHelper = NCCLHelper::getInstance(); + if (!ncclHelper.isLoaded()) + { + TLLM_THROW("NCCL library could not be loaded for dynamic symbol access"); + } + + auto ncclMemAllocFunc = ncclHelper.getNCCLMemAlloc(); + auto ncclCommWindowRegisterFunc = ncclHelper.getNCCLCommWindowRegister(); + + // Defensive checks: both function pointers must be non-null + if (ncclMemAllocFunc == nullptr) + { + TLLM_THROW("ncclMemAlloc function pointer is null, cannot allocate NCCL window buffer"); + } + + if (ncclCommWindowRegisterFunc == nullptr) + { + TLLM_THROW("ncclCommWindowRegister function pointer is null, cannot register NCCL window buffer"); + } + + // Allocate device memory using ncclMemAlloc + ncclResult_t allocResult = ncclMemAllocFunc(&buffer.ptr, size); + if (allocResult != ncclSuccess) + { + TLLM_THROW("ncclMemAlloc failed with error: %d", allocResult); + } + buffer.size = size; + + // Register the buffer with NCCL as a window + ncclResult_t regResult + = ncclCommWindowRegisterFunc(comm, buffer.ptr, size, &buffer.window, NCCL_WIN_COLL_SYMMETRIC); + if (regResult != ncclSuccess) + { + ncclMemFree(buffer.ptr); + TLLM_THROW("ncclCommWindowRegister failed with error: %d", regResult); + } + + TLLM_LOG_TRACE("[NCCLUtil] Allocated and registered NCCL window buffer: handle=%d, ptr=%p, size=%zu, window=%p", + handle, buffer.ptr, size, static_cast(buffer.window)); + + return buffer; +} + +NCCLWindowBuffer NCCLWindowAllocator::searchBufferLocked(ncclComm_t comm, void* ptr) const +{ + auto commIt = mBufferPool.find(comm); + if (commIt == mBufferPool.end()) + { + return NCCLWindowBuffer(); + } + + 
for (auto const& entry : commIt->second) + { + if (entry.buffer.ptr == ptr) + { + return entry.buffer; + } + } + + return NCCLWindowBuffer(); +} + +void NCCLWindowAllocator::registerBufferCleanup(ncclComm_t comm) +{ + // Don't register if already registered + if (mRegisteredComms.find(comm) != mRegisteredComms.end()) + { + return; + } + + mRegisteredComms.insert(comm); + + // Register cleanup with the resource manager + NcclCommResourceManager::getInstance().registerResource( + comm, [this, comm]() { this->cleanupBuffersForComm(comm); }, "NCCLWindowAllocator"); +} + +void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept +{ + if (!comm) + { + return; + } + + // Synchronize CUDA to ensure all operations using these buffers are complete + // before we deregister windows and free memory + cudaError_t cudaErr = cudaDeviceSynchronize(); + if (cudaErr != cudaSuccess) + { + TLLM_LOG_WARNING("[NCCLUtil] cudaDeviceSynchronize failed with error: %d before cleanup for comm %p", cudaErr, + static_cast(comm)); + // Continue anyway - the sync failure might be from a previous error + } + + std::lock_guard lock(mMutex); + + // Check if we've already cleaned up this communicator + if (mRegisteredComms.find(comm) == mRegisteredComms.end()) + { + // Already cleaned up or never registered + return; + } + + auto commIt = mBufferPool.find(comm); + if (commIt == mBufferPool.end()) + { + // No buffers to clean up, but mark as cleaned + mRegisteredComms.erase(comm); + return; + } + + TLLM_LOG_TRACE( + "[NCCLUtil] Cleaning up %zu NCCL window buffers for comm %p", commIt->second.size(), static_cast(comm)); + + // Check for buffers still in use - this shouldn't happen if cleanup is called properly, + // but we log a warning if it does + size_t inUseCount = 0; + for (auto const& entry : commIt->second) + { + if (entry.inUse) + { + ++inUseCount; + } + } + if (inUseCount > 0) + { + TLLM_LOG_WARNING( + "[NCCLUtil] Cleaning up %zu buffers still marked as in-use for comm %p. " + "This may indicate buffers weren't properly released before cleanup.", + inUseCount, static_cast(comm)); + } + + for (auto& entry : commIt->second) + { + if (entry.buffer.isValid()) + { + // Deregister the window - the communicator is still valid at this point + // (cleanup happens before ncclCommDestroy), but we need to be careful + // if buffers are still in use by active operations + if (entry.buffer.window && comm) + { + // Note: Even if buffer is marked inUse, we must deregister since + // the communicator is being destroyed. The communicator is valid, + // but we should handle potential errors gracefully. + ncclResult_t result = ncclCommWindowDeregister(comm, entry.buffer.window); + if (result != ncclSuccess) + { + TLLM_LOG_WARNING( + "[NCCLUtil] ncclCommWindowDeregister failed with error: %d for comm %p, " + "window %p (buffer inUse: %d)", + result, static_cast(comm), static_cast(entry.buffer.window), entry.inUse); + } + } + + // Free device memory using ncclMemFree + // This should be safe even if deregister failed + if (entry.buffer.ptr) + { + try + { + ncclResult_t ncclResult = ncclMemFree(entry.buffer.ptr); + if (ncclResult != ncclSuccess) + { + TLLM_LOG_WARNING("[NCCLUtil] ncclMemFree failed with error: %d", ncclResult); + } + } + catch (...) 
+ { + TLLM_LOG_ERROR("[NCCLUtil] Exception during ncclMemFree for ptr %p", entry.buffer.ptr); + } + } + + TLLM_LOG_TRACE( + "[NCCLUtil] Freed NCCL window buffer: ptr=%p, size=%zu", entry.buffer.ptr, entry.buffer.size); + } + } + + mBufferPool.erase(commIt); + mRegisteredComms.erase(comm); +} + +} // namespace tensorrt_llm::common::nccl_util + +#endif // ENABLE_MULTI_DEVICE diff --git a/cpp/tensorrt_llm/common/ncclUtils.h b/cpp/tensorrt_llm/common/ncclUtils.h new file mode 100644 index 0000000000..d128741e0a --- /dev/null +++ b/cpp/tensorrt_llm/common/ncclUtils.h @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/logger.h" + +#if ENABLE_MULTI_DEVICE +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if ENABLE_MULTI_DEVICE + +#ifdef _WIN32 +#include +#else +#include +#endif + +namespace tensorrt_llm::common::nccl_util +{ + +//============================================================================== +// NCCL Helper - Dynamic Library Loading +//============================================================================== + +// Helper class for dynamically loading NCCL symbols (ncclMemAlloc, ncclCommWindowRegister) +// This allows the code to work with NCCL libraries that may or may not have these symbols +class NCCLHelper +{ +public: + static NCCLHelper& getInstance(); + + // Dynamic loading function type definition + using ncclCommWindowRegisterFunc = ncclResult_t (*)(ncclComm_t, void*, size_t, ncclWindow_t*, int); + using ncclMemAllocFunc = ncclResult_t (*)(void**, size_t); + + // Get function pointer for ncclCommWindowRegister + ncclCommWindowRegisterFunc getNCCLCommWindowRegister(); + + // Get function pointer for ncclMemAlloc + ncclMemAllocFunc getNCCLMemAlloc(); + + // Check if NCCL library is successfully loaded + bool isLoaded() const; + + NCCLHelper(NCCLHelper const&) = delete; + NCCLHelper& operator=(NCCLHelper const&) = delete; + NCCLHelper(NCCLHelper&&) = delete; + NCCLHelper& operator=(NCCLHelper&&) = delete; + +private: + NCCLHelper(); + ~NCCLHelper(); + + void loadNCCLLibrary(); + void* loadLibraryHandle(char const* libName); + void* getSymbolAddress(void* handle, char const* symbolName); + +#ifdef _WIN32 + HMODULE mLibraryHandle; +#else + void* mLibraryHandle; +#endif + + ncclCommWindowRegisterFunc mNCCLCommWindowRegister; + ncclMemAllocFunc mNCCLMemAlloc; + bool mIsLoaded; +}; + +//============================================================================== +// NCCL Resource Management +//============================================================================== + +// Resource cleanup function type. Called before the NCCL communicator is destroyed. 
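+// A cleanup function is typically a small lambda bound to the communicator; for example,
+// NCCLWindowAllocator below registers one that deregisters and frees all of its window buffers.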
+using ResourceCleanupFunc = std::function<void()>;
+
+// Manages resources associated with NCCL communicators. Thread-safe singleton that maintains
+// a pool of resources per NCCL comm. Resources are automatically cleaned up when the
+// communicator is destroyed.
+class NcclCommResourceManager
+{
+public:
+    static NcclCommResourceManager& getInstance() noexcept;
+
+    // Register a resource cleanup function for a specific NCCL communicator.
+    // The cleanup function will be called before ncclCommDestroy.
+    // Thread-safe: Uses global mutex to serialize all operations.
+    void registerResource(ncclComm_t comm, ResourceCleanupFunc cleanup, char const* debugName = nullptr);
+
+    // Cleanup all resources associated with a communicator. Called automatically by
+    // the shared_ptr deleter before ncclCommDestroy.
+    // Thread-safe: Uses global mutex to serialize cleanup operations.
+    // Order-preserving: Resources are cleaned up in registration order.
+    void cleanupResources(ncclComm_t comm) noexcept;
+
+    // Check if a communicator has registered resources.
+    bool hasResources(ncclComm_t comm) const noexcept;
+
+    // Get the number of resources registered for a communicator.
+    size_t getResourceCount(ncclComm_t comm) const noexcept;
+
+    NcclCommResourceManager(NcclCommResourceManager const&) = delete;
+    NcclCommResourceManager& operator=(NcclCommResourceManager const&) = delete;
+    NcclCommResourceManager(NcclCommResourceManager&&) = delete;
+    NcclCommResourceManager& operator=(NcclCommResourceManager&&) = delete;
+
+private:
+    NcclCommResourceManager() = default;
+    ~NcclCommResourceManager() = default;
+
+    using ResourceEntry = std::pair<ResourceCleanupFunc, std::string>;
+
+    mutable std::mutex mMutex;
+    std::unordered_map<ncclComm_t, std::vector<ResourceEntry>> mCommResources;
+};
+
+// RAII helper to register a resource with a NCCL communicator.
+// Automatically registers cleanup function on construction.
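+// Before the RAII helper itself, a minimal illustrative sketch of the registration pattern it
+// wraps. The helper function and the scratch allocation below are hypothetical and not part of
+// this header; only NcclCommResourceManager above is assumed.
+inline void* allocateScratchTiedToComm(ncclComm_t comm, size_t bytes)
+{
+    void* scratch = nullptr;
+    TLLM_CUDA_CHECK(cudaMalloc(&scratch, bytes));
+    // Tie the allocation's lifetime to the communicator: the lambda runs before ncclCommDestroy.
+    NcclCommResourceManager::getInstance().registerResource(
+        comm, [scratch]() { cudaFree(scratch); }, "HypotheticalScratchBuffer");
+    return scratch;
+}
+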
+template +class NcclCommResource +{ +public: + NcclCommResource(ncclComm_t comm, ResourceType&& resource, std::function cleanup, + char const* debugName = nullptr) + : mComm(comm) + , mResource(std::forward(resource)) + , mCleanup(std::move(cleanup)) + , mRegistered(true) + { + // Register with the manager + NcclCommResourceManager::getInstance().registerResource( + comm, + [this]() + { + if (mCleanup) + { + mCleanup(mResource); + } + }, + debugName); + } + + ResourceType& get() + { + return mResource; + } + + ResourceType const& get() const + { + return mResource; + } + + NcclCommResource(NcclCommResource const&) = delete; + NcclCommResource& operator=(NcclCommResource const&) = delete; + NcclCommResource(NcclCommResource&&) = delete; + NcclCommResource& operator=(NcclCommResource&&) = delete; + +private: + ncclComm_t mComm; + ResourceType mResource; + std::function mCleanup; + bool mRegistered; +}; + +//============================================================================== +// NCCL Window Buffer Allocation +//============================================================================== + +// Represents a buffer with an associated NCCL window +struct NCCLWindowBuffer +{ + void* ptr; // Device pointer (same as UBBuffer.addr) + int handle; // Buffer handle/index (for compatibility with UB interface) + size_t size; // Size in bytes + ncclWindow_t window; // NCCL window handle + + NCCLWindowBuffer(void* p = nullptr, int h = -1, size_t s = 0, ncclWindow_t w = nullptr) + : ptr(p) + , handle(h) + , size(s) + , window(w) + { + } + + [[nodiscard]] bool isValid() const + { + return ptr != nullptr && handle >= 0 && size > 0 && window != nullptr; + } + + [[nodiscard]] bool invalid() const + { + return !isValid(); + } + + // Alias for compatibility with UBBuffer interface + void* addr() const + { + return ptr; + } +}; + +// Manages NCCL window-registered buffers with pooling and automatic cleanup. +// Buffers are tied to the lifetime of their associated NCCL communicator. +class NCCLWindowAllocator +{ +public: + static NCCLWindowAllocator& getInstance(); + + // Request a buffer for the given communicator and size. + // If an unused buffer of at least the requested size exists for this communicator, it will be reused. + // Uses best-fit strategy: selects the smallest available buffer that meets the size requirement. + // Otherwise, a new buffer is allocated and registered. + NCCLWindowBuffer requestBuffer(ncclComm_t comm, size_t size); + + // Search for a buffer by pointer. Returns an invalid buffer if not found. + // This matches the UBManager.search_buffer() interface. + NCCLWindowBuffer searchBuffer(ncclComm_t comm, void* ptr) const; + + // Release a buffer back to the pool for potential reuse + void releaseBuffer(ncclComm_t comm, void* ptr); + + // Get the window handle for a specific buffer pointer + ncclWindow_t getWindow(ncclComm_t comm, void* ptr) const; + + // Get the size of a specific buffer pointer + size_t getSize(ncclComm_t comm, void* ptr) const; + + // Get buffer info by pointer + NCCLWindowBuffer getBufferInfo(ncclComm_t comm, void* ptr) const; + + // Get the number of buffers allocated for a communicator + size_t getBufferCount(ncclComm_t comm) const; + + // Get the number of buffers in use for a communicator + size_t getBufferInUseCount(ncclComm_t comm) const; + + // Check if a communicator is valid (non-null) + // Note: We don't track cleaned-up comms because NCCL can reuse memory addresses. 
+ // All non-null comms are considered valid and will be registered when first used. + bool isCommValid(ncclComm_t comm) const noexcept; + + NCCLWindowAllocator(NCCLWindowAllocator const&) = delete; + NCCLWindowAllocator& operator=(NCCLWindowAllocator const&) = delete; + NCCLWindowAllocator(NCCLWindowAllocator&&) = delete; + NCCLWindowAllocator& operator=(NCCLWindowAllocator&&) = delete; + +private: + NCCLWindowAllocator() = default; + ~NCCLWindowAllocator() = default; + + // Allocate a new buffer and register it with NCCL as a window + NCCLWindowBuffer allocateAndRegisterBuffer(ncclComm_t comm, size_t size, int handle); + + // Search for a buffer by pointer (assumes mMutex is already locked) + NCCLWindowBuffer searchBufferLocked(ncclComm_t comm, void* ptr) const; + + // Register cleanup function for all buffers associated with a communicator + void registerBufferCleanup(ncclComm_t comm); + + // Cleanup all buffers for a specific communicator + void cleanupBuffersForComm(ncclComm_t comm) noexcept; + + struct BufferEntry + { + NCCLWindowBuffer buffer; + bool inUse; + }; + + mutable std::mutex mMutex; + std::unordered_map> mBufferPool; + std::unordered_set mRegisteredComms; +}; + +// RAII wrapper for NCCL window buffers +class ScopedNCCLWindowBuffer +{ +public: + ScopedNCCLWindowBuffer(ncclComm_t comm, size_t size) + : mComm(comm) + , mBuffer(NCCLWindowAllocator::getInstance().requestBuffer(comm, size)) + { + } + + ~ScopedNCCLWindowBuffer() + { + if (mBuffer.isValid()) + { + NCCLWindowAllocator::getInstance().releaseBuffer(mComm, mBuffer.ptr); + } + } + + void* getPtr() const + { + return mBuffer.ptr; + } + + size_t getSize() const + { + return mBuffer.size; + } + + ncclWindow_t getWindow() const + { + return mBuffer.window; + } + + NCCLWindowBuffer const& getBuffer() const + { + return mBuffer; + } + + ScopedNCCLWindowBuffer(ScopedNCCLWindowBuffer const&) = delete; + ScopedNCCLWindowBuffer& operator=(ScopedNCCLWindowBuffer const&) = delete; + ScopedNCCLWindowBuffer(ScopedNCCLWindowBuffer&&) = delete; + ScopedNCCLWindowBuffer& operator=(ScopedNCCLWindowBuffer&&) = delete; + +private: + ncclComm_t mComm; + NCCLWindowBuffer mBuffer; +}; + +// Creates a PyTorch tensor backed by an NCCL window buffer. +// The tensor will automatically release the buffer back to the pool when destroyed. +// This is analogous to torch_ext::create_userbuffers_tensor() but for NCCLWindowAllocator. 
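+// Before the tensor helper below, a minimal sketch of the raw allocator round-trip it builds on.
+// The function name and the way the buffer is used are illustrative only; the allocator and the
+// RAII wrapper above are the pieces actually declared in this header.
+inline void exampleWindowBufferRoundTrip(ncclComm_t comm)
+{
+    // Scoped wrapper: requests (or reuses) a window-registered buffer and releases it on scope exit.
+    ScopedNCCLWindowBuffer scratch(comm, 1 << 20);
+    TLLM_CHECK_WITH_INFO(scratch.getBuffer().isValid(), "NCCL window buffer allocation failed");
+
+    // The same device pointer can be looked up later, e.g. to recover the ncclWindow_t handle.
+    auto& allocator = NCCLWindowAllocator::getInstance();
+    ncclWindow_t window = allocator.getWindow(comm, scratch.getPtr());
+    TLLM_CHECK_WITH_INFO(window != nullptr, "NCCL window lookup failed");
+}
+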
+inline std::pair createNCCLWindowTensor( + ncclComm_t comm, at::IntArrayRef shape, torch::ScalarType dtype) +{ + // Calculate buffer size + int64_t buffer_size + = std::accumulate(shape.begin(), shape.end(), 1LL, std::multiplies()) * torch::elementSize(dtype); + + // Calculate strides + std::vector strides_vec(shape.size()); + if (!shape.empty()) + { + strides_vec[shape.size() - 1] = 1; + for (int64_t i = static_cast(shape.size()) - 1; i >= 1; --i) + { + strides_vec[i - 1] = strides_vec[i] * shape[i]; + } + } + + // Request buffer from allocator + auto& allocator = NCCLWindowAllocator::getInstance(); + auto buffer = allocator.requestBuffer(comm, buffer_size); + + // Defensive validation: ensure buffer is valid before proceeding + if (!buffer.isValid()) + { + std::ostringstream oss; + oss << "Failed to allocate NCCL window buffer: invalid buffer returned from requestBuffer " + << "(comm=" << static_cast(comm) << ", buffer_size=" << buffer_size << ")"; + throw std::runtime_error(oss.str()); + } + + // Create custom deleter that releases the buffer + auto deleter = [comm, ptr = buffer.ptr](void*) { NCCLWindowAllocator::getInstance().releaseBuffer(comm, ptr); }; + + // Create tensor from the buffer + auto tensor = torch::from_blob(buffer.ptr, shape, strides_vec, deleter, torch::dtype(dtype).device(torch::kCUDA)); + + return std::make_pair(tensor, buffer); +} + +} // namespace tensorrt_llm::common::nccl_util + +#endif // ENABLE_MULTI_DEVICE diff --git a/cpp/tensorrt_llm/common/opUtils.cpp b/cpp/tensorrt_llm/common/opUtils.cpp index 736cd1c48d..72d966e43d 100644 --- a/cpp/tensorrt_llm/common/opUtils.cpp +++ b/cpp/tensorrt_llm/common/opUtils.cpp @@ -16,6 +16,7 @@ */ #include "tensorrt_llm/common/opUtils.h" +#include "tensorrt_llm/common/ncclUtils.h" #include "tensorrt_llm/runtime/utils/mpiTags.h" #include "tensorrt_llm/runtime/utils/mpiUtils.h" @@ -112,7 +113,29 @@ std::shared_ptr getComm(std::set const& group) std::shared_ptr ncclComm(new ncclComm_t, [](ncclComm_t* comm) { - ncclCommDestroy(*comm); + if (!comm) + { + return; + } + + // STEP 1: Clean up resources and destroy NCCL communicator if it's valid + if (*comm) + { + // Clean up all registered resources FIRST + tensorrt_llm::common::nccl_util::NcclCommResourceManager::getInstance().cleanupResources(*comm); + + // Now destroy the NCCL communicator + ncclResult_t result = ncclCommDestroy(*comm); + if (result != ncclSuccess) + { + TLLM_LOG_WARNING("ncclCommDestroy failed with error: %d", result); + } + + // Clear the communicator value before freeing the pointer + *comm = nullptr; + } + + // STEP 2: Always free the pointer memory (regardless of whether *comm was valid) delete comm; }); #if defined(_WIN32) diff --git a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp index e0f2d5cce2..2e3e6dde66 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp +++ b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp @@ -22,16 +22,8 @@ namespace tensorrt_llm::runtime::ub { UserBufferAllocator& UserBufferAllocator::Instance() { - if (use_nccl_symmetric) - { - static NCCLUserBufferAllocator _; - return _; - } - else - { - static UserBufferAllocator _; - return _; - } + static UserBufferAllocator _; + return _; } void UserBufferAllocator::initialize(tensorrt_llm::runtime::WorldConfig const& worldConfig) @@ -83,167 +75,4 @@ communicator* UserBufferAllocator::comm() return mUbComm; } -void NCCLUserBufferAllocator::initialize(tensorrt_llm::runtime::WorldConfig const& worldConfig) -{ - 
if (!isInitialized()) - { - TLLM_LOG_INFO("Initializing NCCLUserBufferAllocator"); - std::set group; - for (int i = 0; i < worldConfig.getSize(); i++) - { - group.insert(i); - } - mComm = getComm(group); - mIsInitialized = true; - } -} - -UBBuffer NCCLUserBufferAllocator::registerUBBuffer(size_t bytes) -{ - TLLM_CHECK(isInitialized()); - UBBuffer ub_buffer; - - auto& ncclHelper = getNCCLHelper(); - if (!ncclHelper.isLoaded()) - { - TLLM_THROW("NCCL library could not be loaded for dynamic symbol access"); - } - - auto ncclMemAllocFunc = ncclHelper.getNCCLMemAlloc(); - auto ncclCommWindowRegisterFunc = ncclHelper.getNCCLCommWindowRegister(); - - NCCLCHECK(ncclMemAllocFunc(&ub_buffer.addr, bytes)); - NCCLCHECK(ncclCommWindowRegisterFunc((*mComm), ub_buffer.addr, bytes, &ub_buffer.window, NCCL_WIN_COLL_SYMMETRIC)); - ub_buffer.handle = 5; - ub_buffer.size = bytes; - return ub_buffer; -} - -// Static member definitions -std::unique_ptr NCCLUserBufferAllocator::mNCCLHelper = nullptr; - -NCCLHelper& NCCLUserBufferAllocator::getNCCLHelper() -{ - if (!mNCCLHelper) - { - mNCCLHelper = std::make_unique(); - } - return *mNCCLHelper; -} - -// NCCLHelper implementation -NCCLHelper::NCCLHelper() - : mLibraryHandle(nullptr) - , mNCCLCommWindowRegister(nullptr) - , mNCCLMemAlloc(nullptr) - , mIsLoaded(false) -{ - loadNCCLLibrary(); -} - -NCCLHelper::~NCCLHelper() -{ - if (mLibraryHandle) - { -#ifdef _WIN32 - FreeLibrary(mLibraryHandle); -#else - dlclose(mLibraryHandle); -#endif - mLibraryHandle = nullptr; - } -} - -void NCCLHelper::loadNCCLLibrary() -{ - try - { -#ifdef _WIN32 - char const* libraryNames[] = {"nccl.dll"}; -#else - char const* libraryNames[] = {"libnccl.so"}; -#endif - - for (int i = 0; libraryNames[i] != nullptr; ++i) - { - mLibraryHandle = loadLibraryHandle(libraryNames[i]); - if (mLibraryHandle) - { - TLLM_LOG_INFO("Successfully loaded NCCL library: %s", libraryNames[i]); - break; - } - } - - if (!mLibraryHandle) - { - TLLM_LOG_WARNING("Failed to load NCCL library"); - return; - } - - // Load the required symbols - mNCCLCommWindowRegister - = reinterpret_cast(getSymbolAddress(mLibraryHandle, "ncclCommWindowRegister")); - - mNCCLMemAlloc = reinterpret_cast(getSymbolAddress(mLibraryHandle, "ncclMemAlloc")); - - if (mNCCLCommWindowRegister == nullptr) - { - TLLM_LOG_WARNING("Failed to load ncclCommWindowRegister symbol, NCCL symmetric will not be supported."); - } - - if (mNCCLMemAlloc) - { - mIsLoaded = true; - } - else - { - TLLM_LOG_WARNING("Failed to load required NCCL symbols"); - } - } - catch (std::exception const& e) - { - TLLM_LOG_WARNING("Exception while loading NCCL library: %s", e.what()); - } -} - -void* NCCLHelper::loadLibraryHandle(char const* libName) -{ -#ifdef _WIN32 - return LoadLibraryA(libName); -#else - return dlopen(libName, RTLD_LAZY | RTLD_GLOBAL); -#endif -} - -void* NCCLHelper::getSymbolAddress(void* handle, char const* symbolName) -{ - if (!handle) - { - return nullptr; - } - -#ifdef _WIN32 - return GetProcAddress(static_cast(handle), symbolName); -#else - return dlsym(handle, symbolName); -#endif -} - -NCCLHelper::ncclCommWindowRegisterFunc NCCLHelper::getNCCLCommWindowRegister() -{ - return mNCCLCommWindowRegister; -} - -NCCLHelper::ncclMemAllocFunc NCCLHelper::getNCCLMemAlloc() -{ - return mNCCLMemAlloc; -} - -bool NCCLHelper::isLoaded() const -{ - return mIsLoaded; -} - -bool UserBufferAllocator::use_nccl_symmetric = false; - }; // namespace tensorrt_llm::runtime::ub diff --git a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h 
b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h index 4cc9149705..05a4b6dd4e 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h +++ b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h @@ -19,11 +19,6 @@ #if ENABLE_MULTI_DEVICE #include "nccl.h" #include "userbuffers.h" -#ifdef _WIN32 -#include -#else -#include -#endif #else using ncclWindow_t = void*; #endif @@ -69,8 +64,6 @@ public: communicator* comm(); virtual UBBuffer registerUBBuffer(size_t bytes); - static bool use_nccl_symmetric; - private: communicator* mUbComm; @@ -80,55 +73,6 @@ protected: tensorrt_llm::runtime::WorldConfig mWorldConfig; }; -class NCCLHelper -{ -public: - NCCLHelper(); - ~NCCLHelper(); - - // Dynamic loading function type definition - using ncclCommWindowRegisterFunc = ncclResult_t (*)(ncclComm_t, void*, size_t, ncclWindow_t*, int); - using ncclMemAllocFunc = ncclResult_t (*)(void**, size_t); - - // Get function pointer for ncclCommWindowRegister - ncclCommWindowRegisterFunc getNCCLCommWindowRegister(); - - // Get function pointer for ncclMemAlloc - ncclMemAllocFunc getNCCLMemAlloc(); - - // Check if NCCL library is successfully loaded - bool isLoaded() const; - -private: - void loadNCCLLibrary(); - void* loadLibraryHandle(char const* libName); - void* getSymbolAddress(void* handle, char const* symbolName); - -#ifdef _WIN32 - HMODULE mLibraryHandle; -#else - void* mLibraryHandle; -#endif - - ncclCommWindowRegisterFunc mNCCLCommWindowRegister; - ncclMemAllocFunc mNCCLMemAlloc; - bool mIsLoaded; -}; - -class NCCLUserBufferAllocator : public UserBufferAllocator -{ -public: - void initialize(tensorrt_llm::runtime::WorldConfig const& world_config) override; - UBBuffer registerUBBuffer(size_t bytes) override; - - // Get shared NCCLHelper instance - static NCCLHelper& getNCCLHelper(); - -private: - std::shared_ptr mComm; - static std::unique_ptr mNCCLHelper; -}; - #else using communicator = void; #endif diff --git a/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.cpp b/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.cpp index a1fcd3c01f..df2a549b8d 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.cpp +++ b/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ #include "userbuffersManager.h" +#include "tensorrt_llm/common/logger.h" namespace tensorrt_llm::runtime::ub { @@ -29,14 +30,11 @@ UserBuffersManager& UserBuffersManager::get_instance() return allocator; } -void UserBuffersManager::initialize(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, - int64_t gpus_per_node, int64_t buffer_size, bool use_nccl_symmetric) +void UserBuffersManager::initialize( + int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size) { std::lock_guard lock(mutex_); tensorrt_llm::runtime::WorldConfig world_config(tp_size, pp_size, cp_size, rank, gpus_per_node); -#if ENABLE_MULTI_DEVICE - UserBufferAllocator::Instance().use_nccl_symmetric = use_nccl_symmetric; -#endif tensorrt_llm::runtime::ub::ub_initialize(world_config); TLLM_CHECK(tensorrt_llm::runtime::ub::ub_is_initialized()); buffer_size_ = buffer_size; @@ -98,11 +96,10 @@ tensorrt_llm::runtime::ub::communicator* UserBuffersManager::comm() return tensorrt_llm::runtime::ub::ub_comm(); } -void initialize_userbuffers_manager(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, - int64_t gpus_per_node, int64_t buffer_size, bool use_nccl_symmetric) +void initialize_userbuffers_manager( + int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size) { - UserBuffersManager::get_instance().initialize( - tp_size, pp_size, cp_size, rank, gpus_per_node, buffer_size, use_nccl_symmetric); + UserBuffersManager::get_instance().initialize(tp_size, pp_size, cp_size, rank, gpus_per_node, buffer_size); } } // namespace tensorrt_llm::runtime::ub diff --git a/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.h b/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.h index 1b34f8e8a1..7ec39db602 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.h +++ b/cpp/tensorrt_llm/kernels/userbuffers/userbuffersManager.h @@ -46,9 +46,8 @@ public: //! @param gpus_per_node The number of GPUs per node. //! @param buffer_size The size of the buffer to allocate. All buffers allocated by this manager will have this //! size. - //! @param use_nccl_symmetric Whether to use NCCL symmetric communication. - void initialize(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, - int64_t buffer_size, bool use_nccl_symmetric); + void initialize( + int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size); //! @brief Create a UB tensor from the given shape, strides and data type. The function will choose available UB //! buffer or create a new one if no available buffer is found. 
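// A minimal usage sketch of the simplified initializer after this change (values illustrative;
// the former use_nccl_symmetric flag is simply dropped from the call):
//
//     // tp_size=8, pp_size=1, cp_size=1, rank=0, 8 GPUs per node, 64 MiB per UB buffer
//     tensorrt_llm::runtime::ub::initialize_userbuffers_manager(8, 1, 1, 0, 8, 64ll * 1024 * 1024);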
@@ -76,7 +75,7 @@ private: int64_t buffer_size_; }; -void initialize_userbuffers_manager(int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, - int64_t gpus_per_node, int64_t buffer_size, bool use_nccl_symmetric); +void initialize_userbuffers_manager( + int64_t tp_size, int64_t pp_size, int64_t cp_size, int64_t rank, int64_t gpus_per_node, int64_t buffer_size); } // namespace tensorrt_llm::runtime::ub diff --git a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp index 4241cf8d85..112364400d 100644 --- a/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp +++ b/cpp/tensorrt_llm/plugins/ncclPlugin/allreducePlugin.cpp @@ -137,13 +137,12 @@ bool AllreducePlugin::supportsFormatCombination( int pos, nvinfer1::PluginTensorDesc const* inOut, int nbInputs, int nbOutputs) noexcept { int base_inputs = 0; - if (mStrategy == AllReduceStrategyType::NCCL || mStrategy == AllReduceStrategyType::UB) + switch (mStrategy) { - base_inputs = 1; - } - else - { - base_inputs = 2; + case AllReduceStrategyType::NCCL: + case AllReduceStrategyType::UB: + case AllReduceStrategyType::NCCL_SYMMETRIC: base_inputs = 1; break; + default: base_inputs = 2; break; } int fusion_op_extra_inputs = 0; int scale_idx = 0; @@ -169,9 +168,15 @@ bool AllreducePlugin::supportsFormatCombination( TLLM_CHECK(nbInputs == (base_inputs + fusion_op_extra_inputs)); - if (mStrategy != AllReduceStrategyType::NCCL && mStrategy != AllReduceStrategyType::UB && pos == 1) + if (pos == 1) { - return (inOut[pos].type == nvinfer1::DataType::kINT64) && (inOut[pos].format == TensorFormat::kLINEAR); + switch (mStrategy) + { + case AllReduceStrategyType::NCCL: + case AllReduceStrategyType::UB: + case AllReduceStrategyType::NCCL_SYMMETRIC: break; + default: return (inOut[pos].type == nvinfer1::DataType::kINT64) && (inOut[pos].format == TensorFormat::kLINEAR); + } } if (mStrategy == AllReduceStrategyType::UB) { @@ -222,25 +227,26 @@ AllReduceStrategyType AllreducePlugin::selectImplementation( { if (!isAuto) { - TLLM_LOG_INFO("Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL"); + TLLM_LOG_INFO("Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL_SYMMETRIC"); } else if (forceDeterministic) { TLLM_LOG_WARNING( - "Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL. NCCL might produce " + "Since Peer to Peer not supported, fallback to AllReduceStrategy: NCCL_SYMMETRIC. 
NCCL_SYMMETRIC might " + "produce " "non-deterministic results."); } - return AllReduceStrategyType::NCCL; + return AllReduceStrategyType::NCCL_SYMMETRIC; } if (isAuto && !mIsNVLINKSupported && !forceDeterministic) { - return AllReduceStrategyType::NCCL; + return AllReduceStrategyType::NCCL_SYMMETRIC; } auto const maxWorkspaceSize = utils::customAllReduceUtils::getMaxRequiredWorkspaceSize(worldSize); - AllReduceStrategyType strat = AllReduceStrategyType::NCCL; + AllReduceStrategyType strat = AllReduceStrategyType::NCCL_SYMMETRIC; auto const messageSizeBytes = messageSize * common::getDTypeSize(type); if (messageSizeBytes <= maxWorkspaceSize) @@ -268,7 +274,7 @@ AllReduceStrategyType AllreducePlugin::selectImplementation( } else { - strat = AllReduceStrategyType::NCCL; + strat = AllReduceStrategyType::NCCL_SYMMETRIC; } } else @@ -279,7 +285,7 @@ AllReduceStrategyType AllreducePlugin::selectImplementation( } else { - strat = AllReduceStrategyType::NCCL; + strat = AllReduceStrategyType::NCCL_SYMMETRIC; } } @@ -287,30 +293,31 @@ AllReduceStrategyType AllreducePlugin::selectImplementation( { if (!isAuto) { - TLLM_LOG_WARNING("Since not aligned, fallback to AllReduceStrategy: NCCL"); + TLLM_LOG_WARNING("Since not aligned, fallback to AllReduceStrategy: NCCL_SYMMETRIC"); } else if (forceDeterministic) { TLLM_LOG_WARNING( - "Since not aligned, fallback to AllReduceStrategy: NCCL. NCCL might produce " + "Since not aligned, fallback to AllReduceStrategy: NCCL_SYMMETRIC. NCCL_SYMMETRIC might produce " "non-deterministic results."); } - strat = AllReduceStrategyType::NCCL; + strat = AllReduceStrategyType::NCCL_SYMMETRIC; } } else { if (!isAuto) { - TLLM_LOG_WARNING("Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL"); + TLLM_LOG_WARNING("Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL_SYMMETRIC"); } else if (forceDeterministic) { TLLM_LOG_WARNING( - "Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL. NCCL might produce " + "Since messageSize > maxWorkspace, fallback to AllReduceStrategy: NCCL_SYMMETRIC. 
NCCL_SYMMETRIC might " + "produce " "non-deterministic results."); } - strat = AllReduceStrategyType::NCCL; + strat = AllReduceStrategyType::NCCL_SYMMETRIC; } return strat; @@ -337,6 +344,10 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe { runtimeStrategy = AllReduceStrategyType::NCCL; } + else if (mStrategy == AllReduceStrategyType::NCCL_SYMMETRIC) + { + runtimeStrategy = AllReduceStrategyType::NCCL_SYMMETRIC; + } else if (mStrategy == AllReduceStrategyType::UB) { runtimeStrategy = AllReduceStrategyType::UB; @@ -355,6 +366,11 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: NCCL", rank); break; } + case AllReduceStrategyType::NCCL_SYMMETRIC: + { + TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: NCCL_SYMMETRIC", rank); + break; + } case AllReduceStrategyType::ONESHOT: { TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: ONESHOT", rank); @@ -373,14 +389,14 @@ int AllreducePlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfe default: break; } - if (runtimeStrategy == AllReduceStrategyType::NCCL) + if (runtimeStrategy == AllReduceStrategyType::NCCL || runtimeStrategy == AllReduceStrategyType::NCCL_SYMMETRIC) { if (mOp == AllReduceFusionOp::RESIDUAL_RMS_NORM || mOp == AllReduceFusionOp::RESIDUAL_RMS_PREPOST_NORM) { NCCLCHECK(ncclAllReduce(inputs[0], outputs[1], size, (*getDtypeMap())[mType], ncclSum, *mNcclComm, stream)); tensorrt_llm::kernels::AllReduceParams params; int fusion_ptr_idx = 0; - if (mStrategy == AllReduceStrategyType::NCCL) + if (mStrategy == AllReduceStrategyType::NCCL || mStrategy == AllReduceStrategyType::NCCL_SYMMETRIC) { fusion_ptr_idx = 1; } diff --git a/cpp/tensorrt_llm/thop/allreduceOp.cpp b/cpp/tensorrt_llm/thop/allreduceOp.cpp index 21018e241d..fbd60d1ec5 100644 --- a/cpp/tensorrt_llm/thop/allreduceOp.cpp +++ b/cpp/tensorrt_llm/thop/allreduceOp.cpp @@ -15,10 +15,12 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/customAllReduceUtils.h" #include "tensorrt_llm/common/dataType.h" #include "tensorrt_llm/common/mcastDevMemUtils.h" +#include "tensorrt_llm/common/ncclUtils.h" #include "tensorrt_llm/common/opUtils.h" #include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h" #include "tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h" @@ -39,6 +41,7 @@ #if ENABLE_MULTI_DEVICE #include #include +#include #include #include #include @@ -51,6 +54,7 @@ #include #include +#include #include // using namespace nvinfer1; @@ -238,6 +242,9 @@ public: AllreduceOp( std::set group, nvinfer1::DataType type, AllReduceStrategyType strategy, AllReduceFusionOp op, float eps) : mGroup(std::move(group)) + , mIsNVLINKSupported(false) + , mIsP2PSupported(false) + , mIsMNNVLSupported(false) , mType(type) , mStrategy(strategy) , mOp(op) @@ -248,6 +255,9 @@ public: AllreduceOp(std::set group, c10::intrusive_ptr const& process_group_, nvinfer1::DataType type, AllReduceStrategyType strategy, AllReduceFusionOp op, float eps) : mGroup(std::move(group)) + , mIsNVLINKSupported(false) + , mIsP2PSupported(false) + , mIsMNNVLSupported(false) , mType(type) , mStrategy(strategy) , mOp(op) @@ -437,44 +447,117 @@ private: torch::optional const& residual, torch::optional const& norm_weight, torch::optional const& scale, torch::optional const& bias) { + // Handle ProcessGroup path first - cannot extract NCCL comm for window registration + // Use ProcessGroup's allreduce directly and return early + if (mNcclComm.index() == 1) + { + auto torchPg = std::get<1>(mNcclComm); + + torch::Tensor reduceOutput = input.clone(); + std::vector tensors{reduceOutput}; + PGCHECK_THROW(torchPg->allreduce(tensors, {c10d::ReduceOp::SUM})); + + if (mOp == AllReduceFusionOp::NONE) + { + return {reduceOutput}; + } + + // Treat any other patterns as fallback cases. + return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, reduceOutput); + } + + // From here on, we have a raw NCCL comm - can proceed with window registration + auto rawComm = std::get<0>(mNcclComm); + ncclComm_t comm = *rawComm; + TLLM_CHECK_WITH_INFO(comm != nullptr, "NCCL communicator is null"); + TLLM_LOG_DEBUG("[runNCCLAllReduceSymmetric] Using raw NCCL comm path (not ProcessGroup)"); + + using tensorrt_llm::common::nccl_util::NCCLWindowAllocator; + using tensorrt_llm::common::nccl_util::createNCCLWindowTensor; auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); int size = input.numel(); - auto& ub_manager = tensorrt_llm::runtime::ub::UserBuffersManager::get_instance(); - auto ub_tensor0 = input; - auto ub_buffer0 = ub_manager.search_buffer(input.data_ptr()); - if (ub_buffer0.invalid()) + size_t bufferSizeBytes = size * input.element_size(); + + // Using unregistered input buffers with NCCL symmetric, requires a memcpy + // This is an overhead introduced with using NCCL_SYMMTRIC over NCCL. + // Both the memcpy and the perf benefit from using NCCL_SYMMETRIC scale linear with the message size. + // But a local memcpy is cheaper than the remote operations, so with larger message sizes the benefit is + // stronger. Additionally, the perf benefit scales with the number of ranks, since multimem enables O(const.) + // versus O(N) complexity. Hence we model this cutoff with a linear model. The numbers below were obtained on + // GB200, scanning different message sizes and ranks. 
You can determine the regression onset for each number of + // ranks to a single message size. And the following formula was obtained by fitting a linear model to the + // regression onset. It is possible to override this empirical heuristic with the TLLM_NCCL_MIN_REGISTRATION + // environment variable. + double const a = -4986.43478503; + double const b = 156716.52177552; + int nRanks; + NCCLCHECK_THROW(ncclCommCount(comm, &nRanks)); + size_t minRegistrationThreshold = static_cast(std::max(0.0, a * nRanks + b)) * input.element_size(); + // Disable window registration if neither NVLink nor MNNVL is supported + // TODO replace in NCCL 2.29 with comm query + if (!mIsNVLINKSupported && !mIsMNNVLSupported) { - auto [symmetric_input, symmetric_ub_buffer0] - = torch_ext::create_userbuffers_tensor(input.sizes(), input.scalar_type()); - cudaMemcpyAsync(symmetric_ub_buffer0.addr, input.data_ptr(), size * input.element_size(), - cudaMemcpyDeviceToDevice, stream); - ub_buffer0 = symmetric_ub_buffer0; - ub_tensor0 = symmetric_input; + minRegistrationThreshold = std::numeric_limits::max(); + } + char const* envThreshold = std::getenv("TLLM_NCCL_MIN_REGISTRATION"); + if (envThreshold != nullptr) + { + minRegistrationThreshold = static_cast(std::atoi(envThreshold)) * input.element_size(); } - TLLM_CHECK(!ub_buffer0.invalid()); - auto [norm_out, ub_buffer1] = torch_ext::create_userbuffers_tensor(input.sizes(), input.scalar_type()); + // Search for existing buffer + auto& allocator = NCCLWindowAllocator::getInstance(); + auto windowBuffer0 = allocator.searchBuffer(comm, input.data_ptr()); - std::visit(overloaded{[&, norm_out_ = norm_out](std::shared_ptr& rawComm) - { - NCCLCHECK_THROW(ncclAllReduce(ub_buffer0.addr, norm_out_.mutable_data_ptr(), size, - (*getDtypeMap())[mType], ncclSum, *rawComm, stream)); - }, - [&, norm_out_ = norm_out](c10::intrusive_ptr& torchPg) - { - PGCHECK_THROW(PgHelper{torchPg}.allreduce(ub_tensor0, {c10d::ReduceOp::SUM})); - std::ignore = norm_out_.copy_(ub_tensor0, true); - }}, - mNcclComm); + torch::Tensor inputTensor = input; + void* inputPtr = input.data_ptr(); + + // If buffer is not registered, decide whether to register based on size + if (!windowBuffer0.isValid()) + { + if (bufferSizeBytes < minRegistrationThreshold) + { + // Small buffer: use input directly without window registration + TLLM_LOG_DEBUG( + "[runNCCLAllReduceSymmetric] Buffer size %zu bytes < threshold %zu bytes, " + "skipping window registration", + bufferSizeBytes, minRegistrationThreshold); + // inputTensor and inputPtr remain pointing to original input + } + else + { + // Large buffer: create window buffer and copy input (can swap inputTensor reference) + auto [symmetricInput, symmetricBuffer0] + = createNCCLWindowTensor(comm, input.sizes(), input.scalar_type()); + TLLM_CUDA_CHECK(cudaMemcpyAsync( + symmetricBuffer0.ptr, input.data_ptr(), bufferSizeBytes, cudaMemcpyDeviceToDevice, stream)); + windowBuffer0 = symmetricBuffer0; + inputTensor = symmetricInput; // Swap to window-backed tensor + inputPtr = windowBuffer0.ptr; + } + } + else + { + // Buffer already registered - use it directly + inputPtr = windowBuffer0.ptr; + } + + // Use window-backed output buffer + auto [normOut, windowBuffer1] = createNCCLWindowTensor(comm, input.sizes(), input.scalar_type()); + torch::Tensor outputTensor = normOut; + void* outputPtr = windowBuffer1.ptr; + + // Perform allreduce + NCCLCHECK_THROW(ncclAllReduce(inputPtr, outputPtr, size, (*getDtypeMap())[mType], ncclSum, comm, stream)); if (mOp == AllReduceFusionOp::NONE) 
{ - return {norm_out}; + return {outputTensor}; } // Treat any other patterns as fallback cases. - return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, norm_out); + return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, outputTensor); } std::vector runLowPrecisionAllReduce(torch::Tensor const& input, @@ -799,16 +882,104 @@ private: void initGroupTopology() { - static std::map, std::tuple> cache; + static std::map, std::tuple> cache; if (cache.find(mGroup) != cache.end()) { - auto [is_NVLINK_supported, is_P2P_supported] = cache[mGroup]; + auto [is_NVLINK_supported, is_P2P_supported, is_MNNVL_supported] = cache[mGroup]; mIsNVLINKSupported = is_NVLINK_supported; mIsP2PSupported = is_P2P_supported; + mIsMNNVLSupported = is_MNNVL_supported; return; } setGroupTopology(); - cache[mGroup] = {mIsNVLINKSupported, mIsP2PSupported}; + cache[mGroup] = {mIsNVLINKSupported, mIsP2PSupported, mIsMNNVLSupported}; + } + + bool checkMNNVLSupport(int device_id) + { +#if ENABLE_MULTI_DEVICE + // 1. Check CUDA driver version (needs >= 12.0.10) + int cuda_driver_version = -1; + TLLM_CUDA_CHECK(cudaDriverGetVersion(&cuda_driver_version)); + if (cuda_driver_version < 12010) + { + TLLM_LOG_DEBUG("MNNVL check: CUDA Driver version %d < 12010", cuda_driver_version); + return false; + } + + // 2. Check multicast support + CUdevice cu_device; + TLLM_CU_CHECK(cuDeviceGet(&cu_device, device_id)); + auto cuda_driver = tensorrt_llm::common::CUDADriverWrapper::getInstance(); + + int multicast_supported = 0; + TLLM_CU_CHECK(cuda_driver->cuDeviceGetAttribute( + &multicast_supported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, cu_device)); + if (!multicast_supported) + { + TLLM_LOG_DEBUG("MNNVL check: Device %d does not support multicast", device_id); + return false; + } + + // 3. Check fabric handle support + int fabric_handle_supported = 0; + TLLM_CU_CHECK(cuda_driver->cuDeviceGetAttribute( + &fabric_handle_supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, cu_device)); + if (!fabric_handle_supported) + { + TLLM_LOG_DEBUG("MNNVL check: Device %d does not support fabric handles", device_id); + return false; + } + + // 4. Check NVML GPU Fabric Info + nvmlDevice_t nvml_device; + NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(device_id, &nvml_device)); + + nvmlGpuFabricInfo_t fabric_info; + NVML_CHECK_THROW(nvmlDeviceGetGpuFabricInfo(nvml_device, &fabric_info)); + + // Check if fabric is fully initialized + if (fabric_info.state != NVML_GPU_FABRIC_STATE_COMPLETED || fabric_info.status != NVML_SUCCESS) + { + TLLM_LOG_DEBUG( + "MNNVL check: Fabric state not complete - state=%u status=%u", fabric_info.state, fabric_info.status); + return false; + } + + // 5. 
Check NVLink links are active (similar to Python support_nvlink(True)) + unsigned int active_links = 0; + unsigned int available_links = 0; + + for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++) + { + unsigned int cap_p2p = 0; + nvmlReturn_t cap_result + = nvmlDeviceGetNvLinkCapability(nvml_device, link, NVML_NVLINK_CAP_P2P_SUPPORTED, &cap_p2p); + if (cap_result == NVML_SUCCESS && cap_p2p) + { + available_links++; + nvmlEnableState_t link_state; + if (nvmlDeviceGetNvLinkState(nvml_device, link, &link_state) == NVML_SUCCESS + && link_state == NVML_FEATURE_ENABLED) + { + active_links++; + } + } + } + + bool all_links_up = (active_links == available_links && available_links > 0); + if (!all_links_up) + { + TLLM_LOG_DEBUG( + "MNNVL check: Not all NVLink links active - active=%u available=%u", active_links, available_links); + return false; + } + + TLLM_LOG_INFO("MNNVL check: Device %d supports MNNVL (fabric_clique=%u)", device_id, fabric_info.cliqueId); + return true; +#else + return false; +#endif } void setGroupTopology() @@ -820,108 +991,190 @@ private: [&](c10::intrusive_ptr& torchPg) { return getLocalGroupTorch(mGroup); }}, mNcclComm); - if (mGroup.size() != local_group.size()) - { - mIsP2PSupported = false; - mIsNVLINKSupported = false; - TLLM_LOG_INFO("Found inter-node TP group for rank %d", rank); - return; - } - TLLM_LOG_INFO("TP group is intra-node for rank %d", rank); + bool is_inter_node = (mGroup.size() != local_group.size()); NvmlManager nvml_manager; mIsP2PSupported = true; mIsNVLINKSupported = true; + mIsMNNVLSupported = false; - // TODO(ytong): Should we provide group topology info instead of querying it here? - // Use cudaDeviceCanAccessPeer to determine whether p2p is supported, - // and use nvml to determine whether there are nvlink links between ranks. 
- for (int first_device_id : local_group) + // First, check NVLink within local group (intra-node) + if (!local_group.empty()) { - for (int second_device_id : local_group) + for (int first_device_id : local_group) { - if (first_device_id >= second_device_id) + for (int second_device_id : local_group) { - continue; - } - - int can_access_peer = 0; - TLLM_CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, first_device_id, second_device_id)); - - if (!can_access_peer) - { - mIsP2PSupported = false; - mIsNVLINKSupported = false; - - return; - } - - nvmlDevice_t first_device; - NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(first_device_id, &first_device)); - - bool is_NVLINK = false; - - for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++) - { - nvmlPciInfo_t remote_pci_info; - if (nvmlDeviceGetNvLinkRemotePciInfo_v2(first_device, link, &remote_pci_info) != NVML_SUCCESS) + if (first_device_id >= second_device_id) { continue; } - nvmlDevice_t remote_device; - auto const result = nvmlDeviceGetHandleByPciBusId_v2(remote_pci_info.busId, &remote_device); + int can_access_peer = 0; + TLLM_CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, first_device_id, second_device_id)); - if (result == NVML_SUCCESS) + if (!can_access_peer) { - // Two GPUs are connected directly through nvlink - unsigned int remote_device_id; - NVML_CHECK_THROW(nvmlDeviceGetIndex(remote_device, &remote_device_id)); - - if (remote_device_id == static_cast(second_device_id)) - { - is_NVLINK = true; - } + mIsP2PSupported = false; + mIsNVLINKSupported = false; + TLLM_LOG_INFO( + "P2P not supported between local devices %d and %d", first_device_id, second_device_id); + // Continue checking other pairs, but mark as not supported + continue; } - else if (result == NVML_ERROR_NOT_FOUND) + + nvmlDevice_t first_device; + NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(first_device_id, &first_device)); + + bool is_NVLINK = false; + + for (unsigned int link = 0; link < NVML_NVLINK_MAX_LINKS; link++) { - // Maybe Two GPUs are connected via nvswitch, - // now remotePciInfo represents the pci information of nvswitch, - // determine whether nvlink is supported by whether two GPUs are connected to the same - // nvswitch. - nvmlDevice_t second_device; - NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(second_device_id, &second_device)); - - for (unsigned int second_link = 0; second_link < NVML_NVLINK_MAX_LINKS; second_link++) + nvmlPciInfo_t remote_pci_info; + if (nvmlDeviceGetNvLinkRemotePciInfo_v2(first_device, link, &remote_pci_info) != NVML_SUCCESS) { - nvmlPciInfo_t second_remote_pci_info; - if (nvmlDeviceGetNvLinkRemotePciInfo_v2(second_device, second_link, &second_remote_pci_info) - != NVML_SUCCESS) - { - continue; - } + continue; + } - if (strcmp(remote_pci_info.busId, second_remote_pci_info.busId) == 0) + nvmlDevice_t remote_device; + auto const result = nvmlDeviceGetHandleByPciBusId_v2(remote_pci_info.busId, &remote_device); + + if (result == NVML_SUCCESS) + { + // Two GPUs are connected directly through nvlink + unsigned int remote_device_id; + NVML_CHECK_THROW(nvmlDeviceGetIndex(remote_device, &remote_device_id)); + + if (remote_device_id == static_cast(second_device_id)) { is_NVLINK = true; - break; } } - } - else - { - NVML_CHECK_THROW(result); + else if (result == NVML_ERROR_NOT_FOUND) + { + // Maybe Two GPUs are connected via nvswitch, + // now remotePciInfo represents the pci information of nvswitch, + // determine whether nvlink is supported by whether two GPUs are connected to the same + // nvswitch. 
+ nvmlDevice_t second_device; + NVML_CHECK_THROW(nvmlDeviceGetHandleByIndex(second_device_id, &second_device)); + + for (unsigned int second_link = 0; second_link < NVML_NVLINK_MAX_LINKS; second_link++) + { + nvmlPciInfo_t second_remote_pci_info; + if (nvmlDeviceGetNvLinkRemotePciInfo_v2( + second_device, second_link, &second_remote_pci_info) + != NVML_SUCCESS) + { + continue; + } + + if (strcmp(remote_pci_info.busId, second_remote_pci_info.busId) == 0) + { + is_NVLINK = true; + break; + } + } + } + else + { + NVML_CHECK_THROW(result); + } + + if (is_NVLINK) + { + break; + } } - if (is_NVLINK) - { - break; - } + mIsNVLINKSupported &= is_NVLINK; } - - mIsNVLINKSupported &= is_NVLINK; } } + + // For inter-node groups, check MNNVL support + if (is_inter_node) + { + TLLM_LOG_INFO("Found inter-node TP group for rank %d, checking MNNVL support", rank); + + // Check MNNVL support on local device(s) + bool local_mnnvl_supported = false; + if (!local_group.empty()) + { + // Check MNNVL on first device in local group (all devices on same node should have same MNNVL status) + int check_device = *local_group.begin(); + local_mnnvl_supported = checkMNNVLSupport(check_device); + } + + // Gather MNNVL status from all ranks in the group + int local_mnnvl_status = local_mnnvl_supported ? 1 : 0; + std::vector all_mnnvl_status(mGroup.size()); + + std::visit(overloaded{[&](std::shared_ptr& comm_ptr) + { + // For NCCL comm, use MPI to gather status + // Use MPI allgather to collect MNNVL status + // Create a sub-communicator for the group + std::vector group_ranks(mGroup.begin(), mGroup.end()); + MPI_Group world_group, new_group; + MPI_Comm group_comm; + MPI_Comm_group(COMM_SESSION, &world_group); + MPI_Group_incl(world_group, group_ranks.size(), group_ranks.data(), &new_group); + MPI_Comm_create_group(COMM_SESSION, new_group, 0, &group_comm); + + if (group_comm != MPI_COMM_NULL) + { + MPI_Allgather(&local_mnnvl_status, 1, MPI_INT, all_mnnvl_status.data(), 1, MPI_INT, + group_comm); + MPI_Comm_free(&group_comm); + } + MPI_Group_free(&new_group); + MPI_Group_free(&world_group); + }, + [&](c10::intrusive_ptr& torchPg) + { + // For ProcessGroup, use allgather directly + // Note: This assumes the ProcessGroup is already set up for the correct group + std::vector input_tensors + = {torch::tensor({local_mnnvl_status}, torch::kInt32)}; + std::vector> output_tensors(1); + output_tensors[0].resize(mGroup.size()); + auto work = torchPg->allgather(output_tensors, input_tensors); + if (work) + { + work->wait(); + for (size_t i = 0; i < mGroup.size(); ++i) + { + all_mnnvl_status[i] = output_tensors[0][i].item(); + } + } + }}, + mNcclComm); + + // Check if all ranks support MNNVL + bool all_ranks_support_mnnvl = true; + for (int status : all_mnnvl_status) + { + if (status == 0) + { + all_ranks_support_mnnvl = false; + break; + } + } + + // For inter-node: MNNVL support means all nodes have MNNVL + // Also need local NVLink for optimal performance + mIsMNNVLSupported = mIsNVLINKSupported && all_ranks_support_mnnvl; + mIsP2PSupported = false; // P2P doesn't work across nodes + + TLLM_LOG_INFO("Inter-node topology: local_NVLink=%d, local_MNNVL=%d, all_ranks_MNNVL=%d, final_MNNVL=%d", + mIsNVLINKSupported ? 1 : 0, local_mnnvl_status, all_ranks_support_mnnvl ? 1 : 0, + mIsMNNVLSupported ? 
1 : 0); + } + else + { + TLLM_LOG_INFO("TP group is intra-node for rank %d", rank); + } } AllReduceStrategyType selectImplementation(size_t seq_len, size_t hidden_size) @@ -951,12 +1204,12 @@ private: if (ifFallbackToNCCL(seq_len, message_size_bytes, max_workspace_size)) { - return AllReduceStrategyType::NCCL; + return AllReduceStrategyType::NCCL_SYMMETRIC; } - // This rule based heuristic only chooses between NCCL and MIN_LATENCY strategies. - // From this point, all fusion patterns are supported by all these strategies: NCCL, ONESHOT, TWOSHOT and - // MIN_LATENCY. + // This rule based heuristic only chooses between NCCL_SYMMETRIC and MIN_LATENCY strategies. + // From this point, all fusion patterns are supported by all these strategies: NCCL_SYMMETRIC, ONESHOT, TWOSHOT + // and MIN_LATENCY. if (mStrategy != AllReduceStrategyType::AUTO) { // Check TWOSHOT constraint: seq_len >= tp_size @@ -973,12 +1226,11 @@ private: return tensorrt_llm::utils::customAllReduceUtils::selectStrategyLookUpTable( seq_len, hidden_size, mOp, mGroup.size()); } - return AllReduceStrategyType::NCCL; } bool ifFallbackToNCCL(size_t seq_len, size_t message_size_bytes, size_t max_workspace_size) { - // If messageSize is less than maxWorkspaceSize, use NCCL, regardless of the fusion type. + // If messageSize is greater than maxWorkspaceSize or topology is unsuitable, use NCCL_SYMMETRIC fallback. if (message_size_bytes > max_workspace_size || !mIsP2PSupported || !mIsNVLINKSupported) { return true; @@ -1006,6 +1258,7 @@ private: std::set mGroup; bool mIsNVLINKSupported; bool mIsP2PSupported; + bool mIsMNNVLSupported; nvinfer1::DataType mType; AllReduceStrategyType mStrategy; AllReduceFusionOp mOp; diff --git a/cpp/tests/unit_tests/multi_gpu/CMakeLists.txt b/cpp/tests/unit_tests/multi_gpu/CMakeLists.txt index 5fb79c766c..44b8e30577 100644 --- a/cpp/tests/unit_tests/multi_gpu/CMakeLists.txt +++ b/cpp/tests/unit_tests/multi_gpu/CMakeLists.txt @@ -20,3 +20,9 @@ target_link_libraries(cacheTransceiverTest PRIVATE ${Python3_LIBRARIES}) add_gtest(mpiUtilsTest mpiUtilsTest.cpp) add_gtest(userBufferTest userBufferTest.cpp) +add_gtest(ncclUtilsTest ncclUtilsTest.cpp) +target_link_libraries(ncclUtilsTest PRIVATE ${Python3_LIBRARIES}) +if(BUILD_PYT) + target_compile_definitions(ncclUtilsTest PUBLIC BUILD_PYT) + target_link_libraries(ncclUtilsTest PUBLIC ${TORCH_LIBRARIES}) +endif() diff --git a/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp b/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp new file mode 100644 index 0000000000..bf4ddd2141 --- /dev/null +++ b/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp @@ -0,0 +1,745 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/common/ncclUtils.h" +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/common/opUtils.h" +#include "tensorrt_llm/runtime/utils/mpiUtils.h" + +#include +#include +#include +#include + +#if ENABLE_MULTI_DEVICE && BUILD_PYT +#include +#endif + +#if ENABLE_MULTI_DEVICE + +namespace mpi = tensorrt_llm::mpi; +namespace tr = tensorrt_llm::runtime; +namespace nccl_util = tensorrt_llm::common::nccl_util; + +using ::getComm; + +// Helper function to create a split communicator for testing +// This allows us to test cleanup behavior explicitly by controlling the lifetime +std::shared_ptr createSplitComm(ncclComm_t parentComm, int color, int key) +{ + ncclComm_t newComm; + ncclResult_t result = ncclCommSplit(parentComm, color, key, &newComm, nullptr); + if (result != ncclSuccess) + { + TLLM_THROW("ncclCommSplit failed with error: %d", result); + } + + // Create a shared_ptr with custom deleter that cleans up resources first + return std::shared_ptr(new ncclComm_t(newComm), + [](ncclComm_t* comm) + { + if (comm && *comm) + { + // STEP 1: Clean up all registered resources FIRST + tensorrt_llm::common::nccl_util::NcclCommResourceManager::getInstance().cleanupResources(*comm); + + // STEP 2: Now destroy the NCCL communicator + ncclResult_t result = ncclCommDestroy(*comm); + if (result != ncclSuccess) + { + TLLM_LOG_WARNING("ncclCommDestroy failed with error: %d", result); + } + + // STEP 3: Free the memory + delete comm; + } + }); +} + +//============================================================================== +// NcclCommResourceManager Tests +//============================================================================== + +class NcclCommResourceManagerTest : public ::testing::Test +{ +protected: + void SetUp() override + { + auto& comm = mpi::MpiComm::world(); + mWorldSize = comm.getSize(); + mRank = comm.getRank(); + + if (mWorldSize < 2) + { + GTEST_SKIP() << "Requires at least 2 ranks (got " << mWorldSize << ")"; + } + + // Set CUDA device for this rank (required before NCCL initialization) + int deviceCount = 0; + TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + if (deviceCount > 0) + { + int deviceId = mRank % deviceCount; + TLLM_CUDA_CHECK(cudaSetDevice(deviceId)); + } + + // Create a communicator for testing + std::set group; + for (int i = 0; i < mWorldSize; ++i) + { + group.insert(i); + } + mComm = getComm(group); + } + + void TearDown() override + { + // Communicator cleanup happens automatically via shared_ptr deleter + mComm.reset(); + } + + int mWorldSize; + int mRank; + std::shared_ptr mComm; +}; + +TEST_F(NcclCommResourceManagerTest, ResourceRegistration) +{ + auto& manager = nccl_util::NcclCommResourceManager::getInstance(); + + // Create a separate comm using split for this test + auto testComm = createSplitComm(*mComm, 0, mRank); + + // Register a resource + bool cleanupCalled = false; + manager.registerResource( + *testComm, [&cleanupCalled]() { cleanupCalled = true; }, "TestResource"); + + EXPECT_TRUE(manager.hasResources(*testComm)); + EXPECT_EQ(manager.getResourceCount(*testComm), 1); + EXPECT_FALSE(cleanupCalled); // Cleanup not called yet + + // Store the raw comm value before destruction + ncclComm_t rawComm = *testComm; + + // Cleanup should be called when comm is destroyed + testComm.reset(); + + // Verify cleanup was called + EXPECT_TRUE(cleanupCalled); + + // Verify cleanup: check that the old comm (now destroyed) no longer has 
resources + // Note: The comm is destroyed, but we can still check the manager's internal state + // The cleanup should have removed all resources for this comm + EXPECT_FALSE(manager.hasResources(rawComm)); + EXPECT_EQ(manager.getResourceCount(rawComm), 0); +} + +TEST_F(NcclCommResourceManagerTest, MultipleResources) +{ + auto& manager = nccl_util::NcclCommResourceManager::getInstance(); + + // Create a separate comm using split for this test + auto testComm = createSplitComm(*mComm, 0, mRank); + + std::vector cleanupOrder; + manager.registerResource( + *testComm, [&cleanupOrder]() { cleanupOrder.push_back(1); }, "Resource1"); + manager.registerResource( + *testComm, [&cleanupOrder]() { cleanupOrder.push_back(2); }, "Resource2"); + manager.registerResource( + *testComm, [&cleanupOrder]() { cleanupOrder.push_back(3); }, "Resource3"); + + EXPECT_EQ(manager.getResourceCount(*testComm), 3); + + // Cleanup order should be preserved - destroy comm and verify order + testComm.reset(); + + // Verify cleanup order was preserved (1, 2, 3) + EXPECT_EQ(cleanupOrder.size(), 3); + EXPECT_EQ(cleanupOrder[0], 1); + EXPECT_EQ(cleanupOrder[1], 2); + EXPECT_EQ(cleanupOrder[2], 3); +} + +TEST_F(NcclCommResourceManagerTest, ResourceCount) +{ + auto& manager = nccl_util::NcclCommResourceManager::getInstance(); + + // Create a separate comm using split for this test + auto testComm = createSplitComm(*mComm, 0, mRank); + + EXPECT_FALSE(manager.hasResources(*testComm)); + EXPECT_EQ(manager.getResourceCount(*testComm), 0); + + manager.registerResource( + *testComm, []() {}, "Test1"); + EXPECT_EQ(manager.getResourceCount(*testComm), 1); + + manager.registerResource( + *testComm, []() {}, "Test2"); + EXPECT_EQ(manager.getResourceCount(*testComm), 2); + + testComm.reset(); +} + +//============================================================================== +// NCCLWindowAllocator Tests +//============================================================================== + +class NCCLWindowAllocatorTest : public ::testing::Test +{ +protected: + void SetUp() override + { + auto& comm = mpi::MpiComm::world(); + mWorldSize = comm.getSize(); + mRank = comm.getRank(); + + if (mWorldSize < 2) + { + GTEST_SKIP() << "Requires at least 2 ranks (got " << mWorldSize << ")"; + } + + // Set CUDA device for this rank (required before NCCL initialization) + int deviceCount = 0; + TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + if (deviceCount > 0) + { + int deviceId = mRank % deviceCount; + TLLM_CUDA_CHECK(cudaSetDevice(deviceId)); + } + + // Check if NCCL symmetric is supported + auto& ncclHelper = nccl_util::NCCLHelper::getInstance(); + if (!ncclHelper.isLoaded()) + { + GTEST_SKIP() << "NCCL library with symmetric memory support is not available"; + } + + std::set group; + for (int i = 0; i < mWorldSize; ++i) + { + group.insert(i); + } + mComm = getComm(group); + } + + void TearDown() override + { + // Cleanup happens automatically + mComm.reset(); + } + + int mWorldSize; + int mRank; + std::shared_ptr mComm; +}; + +TEST_F(NCCLWindowAllocatorTest, BasicAllocation) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + const size_t bufferSize = 1024 * 1024; // 1MB + auto buffer = allocator.requestBuffer(*mComm, bufferSize); + + EXPECT_TRUE(buffer.isValid()); + EXPECT_NE(buffer.ptr, nullptr); + EXPECT_NE(buffer.window, nullptr); + EXPECT_EQ(buffer.size, bufferSize); + EXPECT_GE(buffer.handle, 0); + + // Verify we can search for it + auto found = allocator.searchBuffer(*mComm, buffer.ptr); + 
EXPECT_TRUE(found.isValid()); + EXPECT_EQ(found.ptr, buffer.ptr); + + // Release the buffer + allocator.releaseBuffer(*mComm, buffer.ptr); +} + +TEST_F(NCCLWindowAllocatorTest, BufferReuse) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + const size_t bufferSize = 512 * 1024; // 512KB + + // Allocate first buffer + auto buffer1 = allocator.requestBuffer(*mComm, bufferSize); + EXPECT_TRUE(buffer1.isValid()); + void* ptr1 = buffer1.ptr; + + // Release it + allocator.releaseBuffer(*mComm, ptr1); + + // Request another buffer of the same size - should reuse + auto buffer2 = allocator.requestBuffer(*mComm, bufferSize); + EXPECT_TRUE(buffer2.isValid()); + EXPECT_EQ(buffer2.ptr, ptr1); // Should be the same buffer + + allocator.releaseBuffer(*mComm, buffer2.ptr); +} + +TEST_F(NCCLWindowAllocatorTest, BestFitReuse) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + // Allocate buffers of different sizes + auto buffer1MB = allocator.requestBuffer(*mComm, 1024 * 1024); + auto buffer2MB = allocator.requestBuffer(*mComm, 2 * 1024 * 1024); + auto buffer512KB = allocator.requestBuffer(*mComm, 512 * 1024); + + void* ptr1MB = buffer1MB.ptr; + void* ptr2MB = buffer2MB.ptr; + void* ptr512KB = buffer512KB.ptr; + + // Release all + allocator.releaseBuffer(*mComm, ptr1MB); + allocator.releaseBuffer(*mComm, ptr2MB); + allocator.releaseBuffer(*mComm, ptr512KB); + + // Request 768KB - should reuse 1MB (best fit, smallest that fits) + auto buffer768KB = allocator.requestBuffer(*mComm, 768 * 1024); + EXPECT_TRUE(buffer768KB.isValid()); + EXPECT_EQ(buffer768KB.ptr, ptr1MB); // Should reuse 1MB buffer + EXPECT_EQ(buffer768KB.size, 1024 * 1024); // Original size + + allocator.releaseBuffer(*mComm, buffer768KB.ptr); +} + +TEST_F(NCCLWindowAllocatorTest, MultipleBuffers) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + const size_t bufferSize = 256 * 1024; + std::vector ptrs; + + // Allocate multiple buffers + for (int i = 0; i < 5; ++i) + { + auto buffer = allocator.requestBuffer(*mComm, bufferSize); + EXPECT_TRUE(buffer.isValid()); + ptrs.push_back(buffer.ptr); + } + + EXPECT_EQ(allocator.getBufferCount(*mComm), 5); + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 5); + + // Release all + for (auto* ptr : ptrs) + { + allocator.releaseBuffer(*mComm, ptr); + } + + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 0); + EXPECT_EQ(allocator.getBufferCount(*mComm), 5); // Buffers still exist, just not in use +} + +TEST_F(NCCLWindowAllocatorTest, SearchBuffer) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + const size_t bufferSize = 128 * 1024; + auto buffer = allocator.requestBuffer(*mComm, bufferSize); + + // Test searchBuffer + auto found = allocator.searchBuffer(*mComm, buffer.ptr); + EXPECT_TRUE(found.isValid()); + EXPECT_EQ(found.ptr, buffer.ptr); + // Compare against actual allocated size (ncclMemAlloc may allocate more than requested) + EXPECT_EQ(found.size, buffer.size); + EXPECT_GE(found.size, bufferSize); // At least the requested size + + // Test search for non-existent buffer + void* fakePtr = reinterpret_cast(0xDEADBEEF); + auto notFound = allocator.searchBuffer(*mComm, fakePtr); + EXPECT_FALSE(notFound.isValid()); + + allocator.releaseBuffer(*mComm, buffer.ptr); +} + +TEST_F(NCCLWindowAllocatorTest, GetWindowAndSize) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + const size_t bufferSize = 64 * 1024; + auto buffer = allocator.requestBuffer(*mComm, bufferSize); + + // Test getWindow + 
auto window = allocator.getWindow(*mComm, buffer.ptr); + EXPECT_NE(window, nullptr); + EXPECT_EQ(window, buffer.window); + + // Test getSize - compare against actual allocated size (ncclMemAlloc may allocate more than requested) + auto size = allocator.getSize(*mComm, buffer.ptr); + EXPECT_EQ(size, buffer.size); + EXPECT_GE(size, bufferSize); // At least the requested size + + // Test with invalid pointer + void* fakePtr = reinterpret_cast(0xDEADBEEF); + EXPECT_EQ(allocator.getWindow(*mComm, fakePtr), nullptr); + EXPECT_EQ(allocator.getSize(*mComm, fakePtr), 0); + + allocator.releaseBuffer(*mComm, buffer.ptr); +} + +TEST_F(NCCLWindowAllocatorTest, GetBufferInfo) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + const size_t bufferSize = 32 * 1024; + auto buffer = allocator.requestBuffer(*mComm, bufferSize); + + auto info = allocator.getBufferInfo(*mComm, buffer.ptr); + EXPECT_TRUE(info.isValid()); + EXPECT_EQ(info.ptr, buffer.ptr); + EXPECT_EQ(info.size, buffer.size); + EXPECT_EQ(info.handle, buffer.handle); + EXPECT_EQ(info.window, buffer.window); + + allocator.releaseBuffer(*mComm, buffer.ptr); +} + +TEST_F(NCCLWindowAllocatorTest, ScopedBuffer) +{ + const size_t bufferSize = 16 * 1024; + + { + nccl_util::ScopedNCCLWindowBuffer scopedBuffer(*mComm, bufferSize); + EXPECT_TRUE(scopedBuffer.getBuffer().isValid()); + EXPECT_NE(scopedBuffer.getPtr(), nullptr); + // Compare against actual allocated size (ncclMemAlloc may allocate more than requested) + EXPECT_EQ(scopedBuffer.getSize(), scopedBuffer.getBuffer().size); + EXPECT_GE(scopedBuffer.getSize(), bufferSize); // At least the requested size + EXPECT_NE(scopedBuffer.getWindow(), nullptr); + + // Buffer should be in use + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 1); + } + + // Buffer should be released when scoped buffer goes out of scope + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 0); +} + +TEST_F(NCCLWindowAllocatorTest, CleanupOnCommDestroy) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + // Create a separate comm using split for this test + auto testComm = createSplitComm(*mComm, 0, mRank); + + // Store the raw comm value before destruction + ncclComm_t rawComm = *testComm; + + // Allocate some buffers + const size_t bufferSize = 8 * 1024; + auto buffer1 = allocator.requestBuffer(*testComm, bufferSize); + auto buffer2 = allocator.requestBuffer(*testComm, bufferSize * 2); + + EXPECT_EQ(allocator.getBufferCount(*testComm), 2); + EXPECT_EQ(allocator.getBufferInUseCount(*testComm), 2); + + // Verify buffers are valid + EXPECT_TRUE(buffer1.isValid()); + EXPECT_TRUE(buffer2.isValid()); + + // Manually release buffers before cleanup to avoid warnings + allocator.releaseBuffer(*testComm, buffer1.ptr); + allocator.releaseBuffer(*testComm, buffer2.ptr); + + // Verify buffers are released but still exist in pool + EXPECT_EQ(allocator.getBufferInUseCount(*testComm), 0); + EXPECT_EQ(allocator.getBufferCount(*testComm), 2); // Buffers still exist, just not in use + + // Destroy the communicator - buffers should be cleaned up automatically + testComm.reset(); + + // Verify cleanup: check that the old comm (now destroyed) no longer has buffers + // Note: The comm is destroyed, but we can still check the allocator's internal state + // The cleanup should have removed all buffers for this comm + EXPECT_EQ(allocator.getBufferCount(rawComm), 0); + 
EXPECT_EQ(allocator.getBufferInUseCount(rawComm), 0); + // Note: isCommValid only checks for null, not cleaned-up state, because NCCL can reuse addresses + // The real check is that buffers are gone, which we verify above +} + +TEST_F(NCCLWindowAllocatorTest, CommValidity) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + // Valid comm should be valid + EXPECT_TRUE(allocator.isCommValid(*mComm)); + + // Null comm should be invalid + EXPECT_FALSE(allocator.isCommValid(nullptr)); +} + +//============================================================================== +// Integration Tests +//============================================================================== + +TEST_F(NCCLWindowAllocatorTest, MultipleComms) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + // Create two different communicators using split (different colors) + auto comm1 = createSplitComm(*mComm, 0, mRank); + auto comm2 = createSplitComm(*mComm, 1, mRank); + + const size_t bufferSize = 4 * 1024; + + // Allocate buffers from both comms + auto buffer1 = allocator.requestBuffer(*comm1, bufferSize); + auto buffer2 = allocator.requestBuffer(*comm2, bufferSize); + + EXPECT_TRUE(buffer1.isValid()); + EXPECT_TRUE(buffer2.isValid()); + + // Buffers should be tracked separately per comm + EXPECT_EQ(allocator.getBufferCount(*comm1), 1); + EXPECT_EQ(allocator.getBufferCount(*comm2), 1); + EXPECT_NE(buffer1.ptr, buffer2.ptr); // Different buffers from different comms + + allocator.releaseBuffer(*comm1, buffer1.ptr); + allocator.releaseBuffer(*comm2, buffer2.ptr); + + // Clean up comms + comm1.reset(); + comm2.reset(); +} + +#if ENABLE_MULTI_DEVICE && BUILD_PYT +//============================================================================== +// createNCCLWindowTensor Tests +//============================================================================== + +class CreateNCCLWindowTensorTest : public ::testing::Test +{ +protected: + void SetUp() override + { + auto& comm = mpi::MpiComm::world(); + mWorldSize = comm.getSize(); + mRank = comm.getRank(); + + if (mWorldSize < 2) + { + GTEST_SKIP() << "Requires at least 2 ranks (got " << mWorldSize << ")"; + } + + // Set CUDA device for this rank (required before NCCL initialization) + int deviceCount = 0; + TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + if (deviceCount > 0) + { + int deviceId = mRank % deviceCount; + TLLM_CUDA_CHECK(cudaSetDevice(deviceId)); + } + + // Check if NCCL symmetric is supported + auto& ncclHelper = nccl_util::NCCLHelper::getInstance(); + if (!ncclHelper.isLoaded()) + { + GTEST_SKIP() << "NCCL library with symmetric memory support is not available"; + } + + std::set group; + for (int i = 0; i < mWorldSize; ++i) + { + group.insert(i); + } + mComm = getComm(group); + } + + void TearDown() override + { + mComm.reset(); + } + + int mWorldSize; + int mRank; + std::shared_ptr mComm; +}; + +TEST_F(CreateNCCLWindowTensorTest, BasicTensorCreation) +{ + using nccl_util::createNCCLWindowTensor; + + // Create a tensor with shape [4, 8] and float32 dtype + std::vector shape = {4, 8}; + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + + // Verify tensor properties + EXPECT_TRUE(tensor.defined()); + EXPECT_EQ(tensor.dtype(), torch::kFloat32); + EXPECT_EQ(tensor.device().type(), torch::kCUDA); + EXPECT_EQ(tensor.dim(), 2); + EXPECT_EQ(tensor.size(0), 4); + EXPECT_EQ(tensor.size(1), 8); + EXPECT_EQ(tensor.numel(), 4 * 8); + + // Verify buffer properties + EXPECT_TRUE(buffer.isValid()); + 
EXPECT_NE(buffer.ptr, nullptr); + // ncclMemAlloc may allocate more than requested, so check at least the requested size + EXPECT_GE(buffer.size, 4 * 8 * sizeof(float)); + EXPECT_NE(buffer.window, nullptr); + + // Verify tensor data pointer matches buffer pointer + EXPECT_EQ(tensor.data_ptr(), buffer.ptr); + + // Tensor should be in use + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 1); +} + +TEST_F(CreateNCCLWindowTensorTest, DifferentDtypes) +{ + using nccl_util::createNCCLWindowTensor; + + std::vector shape = {10}; + + // Test float32 + { + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + EXPECT_EQ(tensor.dtype(), torch::kFloat32); + // ncclMemAlloc may allocate more than requested, so check at least the requested size + EXPECT_GE(buffer.size, 10 * sizeof(float)); + EXPECT_EQ(tensor.data_ptr(), buffer.ptr); + } + + // Test float16 + { + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat16); + EXPECT_EQ(tensor.dtype(), torch::kFloat16); + // ncclMemAlloc may allocate more than requested, so check at least the requested size + EXPECT_GE(buffer.size, 10 * sizeof(at::Half)); + EXPECT_EQ(tensor.data_ptr(), buffer.ptr); + } + + // Test int32 + { + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kInt32); + EXPECT_EQ(tensor.dtype(), torch::kInt32); + // ncclMemAlloc may allocate more than requested, so check at least the requested size + EXPECT_GE(buffer.size, 10 * sizeof(int32_t)); + EXPECT_EQ(tensor.data_ptr(), buffer.ptr); + } +} + +TEST_F(CreateNCCLWindowTensorTest, DifferentShapes) +{ + using nccl_util::createNCCLWindowTensor; + + // 1D tensor + { + std::vector shape = {100}; + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + EXPECT_EQ(tensor.dim(), 1); + EXPECT_EQ(tensor.size(0), 100); + // ncclMemAlloc may allocate more than requested, so check at least the requested size + EXPECT_GE(buffer.size, 100 * sizeof(float)); + } + + // 3D tensor + { + std::vector shape = {2, 3, 4}; + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + EXPECT_EQ(tensor.dim(), 3); + EXPECT_EQ(tensor.size(0), 2); + EXPECT_EQ(tensor.size(1), 3); + EXPECT_EQ(tensor.size(2), 4); + // ncclMemAlloc may allocate more than requested, so check at least the requested size + EXPECT_GE(buffer.size, 2 * 3 * 4 * sizeof(float)); + } + + // 4D tensor + { + std::vector shape = {1, 2, 3, 4}; + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + EXPECT_EQ(tensor.dim(), 4); + EXPECT_EQ(tensor.numel(), 1 * 2 * 3 * 4); + // ncclMemAlloc may allocate more than requested, so check at least the requested size + EXPECT_GE(buffer.size, 1 * 2 * 3 * 4 * sizeof(float)); + } +} + +TEST_F(CreateNCCLWindowTensorTest, TensorDeleterReleasesBuffer) +{ + using nccl_util::createNCCLWindowTensor; + + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + { + std::vector shape = {16, 16}; + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 1); + EXPECT_TRUE(buffer.isValid()); + void* bufferPtr = buffer.ptr; + + // Tensor goes out of scope - deleter should release the buffer + } + + // Buffer should be released (not in use anymore) + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 0); + + // Buffer should still exist in the pool (for reuse) + EXPECT_GE(allocator.getBufferCount(*mComm), 1); +} + 
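+// Minimal usage sketch (comment only; `comm` is an ncclComm_t, API as exercised by the tests above):
+//   auto [tensor, buffer] = nccl_util::createNCCLWindowTensor(comm, {16, 16}, torch::kFloat32);
+//   // ... use `tensor` as the input/output of an NCCL symmetric-memory collective ...
+//   // When `tensor` is destroyed, its deleter releases `buffer` back to the
+//   // NCCLWindowAllocator pool, so a later request of a compatible size can reuse it.
+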
+TEST_F(CreateNCCLWindowTensorTest, MultipleTensors) +{ + using nccl_util::createNCCLWindowTensor; + + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + + std::vector shape = {8, 8}; + auto [tensor1, buffer1] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + auto [tensor2, buffer2] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + auto [tensor3, buffer3] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + + EXPECT_EQ(allocator.getBufferInUseCount(*mComm), 3); + EXPECT_NE(buffer1.ptr, buffer2.ptr); + EXPECT_NE(buffer2.ptr, buffer3.ptr); + EXPECT_NE(buffer1.ptr, buffer3.ptr); + + // All tensors should be valid + EXPECT_TRUE(tensor1.defined()); + EXPECT_TRUE(tensor2.defined()); + EXPECT_TRUE(tensor3.defined()); +} + +TEST_F(CreateNCCLWindowTensorTest, TensorStrides) +{ + using nccl_util::createNCCLWindowTensor; + + std::vector shape = {3, 4, 5}; + auto [tensor, buffer] = createNCCLWindowTensor(*mComm, shape, torch::kFloat32); + + // Verify strides are correct (row-major order) + EXPECT_EQ(tensor.stride(0), 4 * 5); // stride for first dimension + EXPECT_EQ(tensor.stride(1), 5); // stride for second dimension + EXPECT_EQ(tensor.stride(2), 1); // stride for third dimension +} + +#endif // ENABLE_MULTI_DEVICE && BUILD_PYT + +#endif // ENABLE_MULTI_DEVICE diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 811f11fce5..aaac2256c9 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -2844,11 +2844,17 @@ class PyTorchModelEngine(ModelEngine): # Disable UB for unsupported platforms if not ub.ub_supported(): return False - use_nccl_symmetric = self.llm_args.allreduce_strategy == "NCCL_SYMMETRIC" - ub.initialize_userbuffers_manager( - self.mapping.tp_size, self.mapping.pp_size, self.mapping.cp_size, - self.mapping.rank, self.mapping.gpus_per_node, - hidden_size * self.max_num_tokens * 2, use_nccl_symmetric) + # NCCL_SYMMETRIC strategy no longer requires UserBuffer allocator initialization. + # It uses NCCLWindowAllocator from ncclUtils directly. 
+ if self.llm_args.allreduce_strategy == "NCCL_SYMMETRIC": + # Skip UB initialization for NCCL_SYMMETRIC - it uses NCCLWindowAllocator directly + return False + ub.initialize_userbuffers_manager(self.mapping.tp_size, + self.mapping.pp_size, + self.mapping.cp_size, + self.mapping.rank, + self.mapping.gpus_per_node, + hidden_size * self.max_num_tokens * 2) return True diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 19e31d04ce..b4c986fd6a 100755 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -4020,7 +4020,10 @@ def create_allreduce_plugin( pfc = trt.PluginFieldCollection(pfc) ar_plug = allreduce_plg_creator.create_plugin("allreduce", pfc) plug_inputs = [tensor] - if all_reduce_params.strategy != AllReduceStrategy.NCCL and all_reduce_params.strategy != AllReduceStrategy.UB: + if all_reduce_params.strategy not in { + AllReduceStrategy.NCCL, AllReduceStrategy.UB, + AllReduceStrategy.NCCL_SYMMETRIC + }: plug_inputs.append(workspace) if all_reduce_params.fusion_op != AllReduceFusionOp.NONE: if all_reduce_params.has_bias() == 1: @@ -4092,7 +4095,7 @@ def allreduce( workspace = None if all_reduce_params.strategy != AllReduceStrategy.NCCL and all_reduce_params.strategy != AllReduceStrategy.UB: if current_all_reduce_helper().workspace is None: - all_reduce_params.strategy = AllReduceStrategy.NCCL + all_reduce_params.strategy = AllReduceStrategy.NCCL_SYMMETRIC else: workspace = current_all_reduce_helper().workspace.trt_tensor if all_reduce_params.strategy == AllReduceStrategy.UB: diff --git a/tests/integration/defs/cpp/test_multi_gpu.py b/tests/integration/defs/cpp/test_multi_gpu.py index 3b384dd58e..7cf92efaad 100644 --- a/tests/integration/defs/cpp/test_multi_gpu.py +++ b/tests/integration/defs/cpp/test_multi_gpu.py @@ -127,6 +127,24 @@ def run_user_buffer_tests(build_dir: _pl.Path, nprocs=2, timeout=300): timeout=timeout) +def run_nccl_utils_tests(build_dir: _pl.Path, nprocs=2, timeout=300): + tests_dir = build_dir / "tests" / "unit_tests" / "multi_gpu" + mgpu_env = get_multi_gpu_env() + + nccl_utils_test = [ + "mpirun", + "-n", + f"{nprocs}", + "--allow-run-as-root", + "ncclUtilsTest", + ] + + _cpp.run_command(nccl_utils_test, + cwd=tests_dir, + env=mgpu_env, + timeout=timeout) + + def run_llama_executor_leader_tests(build_dir: _pl.Path, timeout=1500): tests_dir = build_dir / "tests" / "e2e_tests" @@ -505,6 +523,15 @@ def test_user_buffer(build_google_tests, nprocs, build_dir): run_user_buffer_tests(build_dir=build_dir, nprocs=nprocs, timeout=300) +@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"], + indirect=True) +@pytest.mark.parametrize("nprocs", [2, 8], ids=["2proc", "8proc"]) +def test_nccl_utils(build_google_tests, nprocs, build_dir): + + if platform.system() != "Windows": + run_nccl_utils_tests(build_dir=build_dir, nprocs=nprocs, timeout=300) + + @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"], indirect=True) @pytest.mark.parametrize("multi_gpu_model", ["t5"], indirect=True) diff --git a/tests/microbenchmarks/all_reduce.py b/tests/microbenchmarks/all_reduce.py index 837b034812..bd5ceb8826 100644 --- a/tests/microbenchmarks/all_reduce.py +++ b/tests/microbenchmarks/all_reduce.py @@ -176,6 +176,7 @@ def allreduce_benchmark( ] strategies = [ AllReduceStrategy.NCCL, + AllReduceStrategy.NCCL_SYMMETRIC, AllReduceStrategy.ONESHOT, AllReduceStrategy.TWOSHOT, AllReduceStrategy.AUTO, @@ -242,6 +243,9 @@ def allreduce_benchmark( # print the dataframe if mapping.rank == 0: pd.set_option('display.max_rows', 
None) + pd.set_option('display.max_columns', None) + pd.set_option('display.width', None) + pd.set_option('display.max_colwidth', None) print(df) # # save the dataframe to a csv file diff --git a/tests/scripts/allreduce_perf/allreduce_heuristic_code_gen.py b/tests/scripts/allreduce_perf/allreduce_heuristic_code_gen.py index 11c114e9cf..e7aeb994b6 100644 --- a/tests/scripts/allreduce_perf/allreduce_heuristic_code_gen.py +++ b/tests/scripts/allreduce_perf/allreduce_heuristic_code_gen.py @@ -28,6 +28,7 @@ class Constants: tp_size_list = [2, 4, 8] strategy_name_to_enum = { 'NCCL': 0, + 'NCCL_SYMMETRIC': 8, 'ONESHOT': 4, 'TWOSHOT': 5, } @@ -84,10 +85,10 @@ def generate_heuristic_look_up_table(df: pd.DataFrame) -> str: hidden_size_count = len(Constants.hidden_size_list) num_tokens_count = len(Constants.num_tokens_list) - # Initialize lookup table with default values (NCCL = 0) + # Initialize lookup table with default values (NCCL_SYMMETRIC = 8) strategy_table = np.full( (tp_size_count, fusion_count, hidden_size_count, num_tokens_count), - Constants.strategy_name_to_enum['NCCL'], + Constants.strategy_name_to_enum['NCCL_SYMMETRIC'], dtype=int) # Fill the lookup table with best strategies diff --git a/tests/unittest/_torch/multi_gpu/test_allreduce.py b/tests/unittest/_torch/multi_gpu/test_allreduce.py index c01fe9205c..5051998c5a 100644 --- a/tests/unittest/_torch/multi_gpu/test_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_allreduce.py @@ -123,7 +123,7 @@ def run_allreduce_op(x: torch.Tensor, residual: torch.Tensor, hidden_size: int, dtype=dtype, mapping=mapping, tensor_parallel_mode=TensorParallelMode.ROW, - allreduce_strategy=AllReduceStrategy.NCCL, + allreduce_strategy=AllReduceStrategy.NCCL_SYMMETRIC, ).cuda() allreduce = AllReduce(mapping=mapping) norm = RMSNorm(hidden_size=hidden_size, eps=eps, dtype=dtype).cuda() diff --git a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py index 56cf5a9562..524fed462e 100644 --- a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py +++ b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py @@ -108,7 +108,7 @@ def row_linear_residual_norm_fusion_forward( ub.initialize_userbuffers_manager( tensor_parallel_size, 1, 1, tensor_parallel_rank, torch.cuda.device_count(), - x_list[0].nelement() * x_list[0].element_size(), True) + x_list[0].nelement() * x_list[0].element_size()) elif strategy == AllReduceStrategy.MNNVL: os.environ["TLLM_TEST_MNNVL"] = "1" diff --git a/tests/unittest/_torch/multi_gpu/test_user_buffers.py b/tests/unittest/_torch/multi_gpu/test_user_buffers.py index c547c8a3e8..6de03d1908 100644 --- a/tests/unittest/_torch/multi_gpu/test_user_buffers.py +++ b/tests/unittest/_torch/multi_gpu/test_user_buffers.py @@ -43,8 +43,7 @@ def create_tp_mapping(tp_size, rank): def init_userbuffers_allocator(tp_size, rank, max_ub_size): ub.initialize_userbuffers_manager(tp_size, 1, 1, rank, - torch.cuda.device_count(), max_ub_size, - False) + torch.cuda.device_count(), max_ub_size) def create_userbuffers_tensor(shape, dtype): From 383178c00a94fa104155659f2668068061b275fd Mon Sep 17 00:00:00 2001 From: chenfeiz0326 Date: Mon, 8 Dec 2025 09:00:44 +0800 Subject: [PATCH 05/10] [TRTLLM-9000][feat] Add multi-node Perf Tests into CI (#8800) Signed-off-by: Chenfei Zhang --- jenkins/L0_Test.groovy | 14 +- jenkins/scripts/slurm_run.sh | 2 +- .../defs/perf/open_search_db_utils.py | 113 ++- tests/integration/defs/perf/test_perf.py | 952 +++++++++++++----- tests/integration/defs/perf/utils.py | 
348 ++++++- .../test-db/l0_dgx_b200_perf_sanity.yml | 41 + .../test-db/l0_dgx_b300_perf_sanity.yml | 41 + .../l0_gb200_multi_gpus_perf_sanity.yml | 22 + .../l0_gb200_multi_nodes_perf_sanity.yml | 16 + .../test-db/perf_sanity_l0_dgx_b200.yml | 35 - .../test-db/perf_sanity_l0_dgx_b300.yml | 37 - tests/scripts/perf-sanity/README.md | 201 ++-- tests/scripts/perf-sanity/l0_dgx_b200.yaml | 325 +++++- tests/scripts/perf-sanity/l0_dgx_b300.yaml | 226 ++++- .../perf-sanity/l0_gb200_multi_gpus.yaml | 294 ++++++ .../perf-sanity/l0_gb200_multi_nodes.yaml | 71 ++ 16 files changed, 2119 insertions(+), 619 deletions(-) create mode 100644 tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml create mode 100644 tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml create mode 100644 tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml create mode 100644 tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity.yml delete mode 100644 tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b200.yml delete mode 100644 tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b300.yml create mode 100644 tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml create mode 100644 tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 41c66a7887..26c7716ba8 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -1126,7 +1126,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, runWithSbatch=false, skipInstallWheel=false, cpver="cp312") { echo "Run Slurm job with native sbatch: $runWithSbatch" - if(nodeCount > 1 || runWithSbatch) { + if (nodeCount > 1 || runWithSbatch) { runLLMTestlistWithSbatch(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, nodeCount, skipInstallWheel, cpver) } else { runLLMTestlistWithAgent(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, gpuCount, skipInstallWheel, cpver) @@ -2493,7 +2493,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO error "Some tests still failed after rerun attempts, please check the test report." } - if (perfMode) { + if (perfMode && !stageName.contains("Perf-Sanity")) { basePerfFilename = stageName.contains("PyTorch") ? 
"base_perf_pytorch.csv" : "base_perf.csv" basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}" stage("Check perf result") { @@ -2909,9 +2909,9 @@ def launchTestJobs(pipeline, testFilter) "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4], "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4], // Perf sanity post merge test - // Disable perf stages due to https://nvbugs/5643646 - // "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4], - // "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "perf_sanity_l0_dgx_b300", 1, 1, 4], + // "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4], + // "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8], + // "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4], ] fullSet += x86SlurmTestConfigs.keySet() @@ -2939,6 +2939,8 @@ def launchTestJobs(pipeline, testFilter) "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4], "GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4], "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4], + // Perf sanity post merge test + "GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4], // Disable GB300 stages due to nodes will be offline temporarily. // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1], // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4], @@ -2953,6 +2955,8 @@ def launchTestJobs(pipeline, testFilter) "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2], + // Perf sanity post merge test + "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_perf_sanity", 1, 1, 8, 2], ] fullSet += multiNodesSBSAConfigs.keySet() diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh index 8f191b3edb..717f1be791 100755 --- a/jenkins/scripts/slurm_run.sh +++ b/jenkins/scripts/slurm_run.sh @@ -109,7 +109,7 @@ echo "Full Command: $pytestCommand" eval $pytestCommand echo "Rank${SLURM_PROCID} Pytest finished execution" -if [ "$perfMode" = "true" ]; then +if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then if [[ "$stageName" == *PyTorch* ]]; then basePerfFilename="base_perf_pytorch.csv" else diff --git a/tests/integration/defs/perf/open_search_db_utils.py b/tests/integration/defs/perf/open_search_db_utils.py index 9f9ebda169..434af387a5 100644 --- a/tests/integration/defs/perf/open_search_db_utils.py +++ b/tests/integration/defs/perf/open_search_db_utils.py @@ -20,6 +20,7 @@ import os import re import sys import time +from datetime import datetime from defs.trt_test_alternative import print_info @@ -32,40 +33,6 @@ from jenkins.scripts.open_search_db import OpenSearchDB PROJECT_ROOT = "sandbox-temp-trtllm-ci-perf-v1" # "sandbox-trtllm-ci-perf" TEST_INFO_PROJECT_NAME = f"{PROJECT_ROOT}-test_info" -# Server config fields to compare -SERVER_FIELDS = [ - "s_model_name", - "l_gpus", - "l_tp", - "l_ep", - "l_pp", - "l_max_num_tokens", - 
"b_enable_chunked_prefill", - "b_disable_overlap_scheduler", - "s_attention_backend", - "s_moe_backend", - "l_moe_max_num_tokens", - "l_stream_interval", - "b_enable_attention_dp", - "b_attention_dp_balance", - "l_batching_wait_iters", - "l_timeout_iters", - "s_kv_cache_dtype", - "b_enable_block_reuse", - "d_free_gpu_memory_fraction", - "l_max_batch_size", - "b_enable_padding", -] - -# Client config fields to compare -CLIENT_FIELDS = [ - "l_concurrency", - "l_iterations", - "l_isl", - "l_osl", - "d_random_range_ratio", -] - # Metrics where larger is better MAXIMIZE_METRICS = [ "d_seq_throughput", @@ -137,6 +104,7 @@ def get_job_info(): trigger_mr_link = "" trigger_mr_id = "" trigger_mr_commit = "" + artifact_url = "" if is_pr_job: # Get PR info from github_pr_api_url github_pr_api_url = global_vars.get("github_pr_api_url", "") @@ -162,6 +130,9 @@ def get_job_info(): # Set trigger_mr_commit to commit trigger_mr_commit = commit + artifact_url = f"https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/LLM/main/L0_PostMerge/{job_id}" if job_id else "" + else: + artifact_url = f"https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/LLM/main/L0_PostMerge/{job_id}" if job_id else "" return { "b_is_baseline": False, @@ -185,11 +156,12 @@ def get_job_info(): "s_trigger_mr_link": trigger_mr_link, "s_trigger_mr_id": trigger_mr_id, "s_trigger_mr_commit": trigger_mr_commit, + "s_artifact_url": artifact_url, "b_is_regression": False, } -def query_history_data(): +def query_history_data(gpu_type): """ Query post-merge data with specific gpu type and model name """ @@ -209,6 +181,16 @@ def query_history_data(): "b_is_post_merge": True } }, + { + "term": { + "b_is_regression": False + } + }, + { + "term": { + "s_gpu_type": gpu_type + } + }, { "range": { "ts_created": { @@ -263,30 +245,38 @@ def query_history_data(): return [] -def match(history_data, new_data): +def match(history_data, new_data, match_keys): """ Check if the server and client config of history data matches the new data """ - # Combine all fields to compare (excluding log links) - fields_to_compare = SERVER_FIELDS + CLIENT_FIELDS def is_empty(value): - """Check if a value is empty (None, empty string, etc.)""" return value is None or value == "" - # Compare each field - for field in fields_to_compare: - history_value = history_data.get(field) - new_value = new_data.get(field) + def should_skip_field(field): + # Skip fields starting with @, _, ts_ + if field.startswith('@') or field.startswith('_') or field.startswith( + 'ts_'): + return True + # Skip log links and speculative_model_dir and job configs + if field in [ + 's_speculative_model_dir', 's_server_log_link', + 's_ctx_server_log_link', 's_gen_server_log_link', + 's_client_log_link' + ]: + return True + return False - # If both are empty, consider them equal + for field in match_keys: + # Skip excluded fields + if should_skip_field(field): + continue + history_value = history_data.get(field, None) + new_value = new_data.get(field, None) if is_empty(history_value) and is_empty(new_value): continue - - # If values don't match, return False if history_value != new_value: return False - return True @@ -339,27 +329,44 @@ def calculate_best_perf_result(history_data_list, new_data): return best_metrics -def get_history_data(new_data_dict): +def get_history_data(new_data_dict, gpu_type, match_keys): """ Query history post-merge data for each cmd_idx """ + + def get_latest_data(data_list): + if not data_list: + return None + time_format = "%b %d, %Y @ %H:%M:%S.%f" 
+ # Find the item with the maximum ts_created value + latest_data = max( + data_list, + key=lambda x: datetime.strptime(x["ts_created"], time_format)) + return latest_data + history_baseline_dict = {} history_data_dict = {} cmd_idxs = new_data_dict.keys() for cmd_idx in cmd_idxs: history_data_dict[cmd_idx] = [] - history_baseline_dict[cmd_idx] = None - history_data_list = query_history_data() + history_baseline_dict[cmd_idx] = [] + history_data_list = [] + if cmd_idxs: + history_data_list = query_history_data(gpu_type) if history_data_list: for history_data in history_data_list: for cmd_idx in cmd_idxs: - if match(history_data, new_data_dict[cmd_idx]): + if match(history_data, new_data_dict[cmd_idx], match_keys): if history_data.get("b_is_baseline") and history_data.get( "b_is_baseline") == True: - history_baseline_dict[cmd_idx] = history_data + history_baseline_dict[cmd_idx].append(history_data) else: history_data_dict[cmd_idx].append(history_data) break + # Sometime database has several baselines and we only use the latest baseline one + for cmd_idx, baseline_list in history_baseline_dict.items(): + latest_baseline = get_latest_data(baseline_list) + history_baseline_dict[cmd_idx] = latest_baseline return history_baseline_dict, history_data_dict @@ -477,6 +484,8 @@ def post_new_perf_data(new_baseline_data_dict, new_data_dict, # Only post regressive test cases when post-merge. if new_baseline_data_dict: data_list.extend(regressive_data_list) + if not data_list: + return try: print_info( f"Ready to post {len(data_list)} data to {TEST_INFO_PROJECT_NAME}") diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index 942b3bd878..c8cd559e4d 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -18,6 +18,7 @@ TensorRT LLM perf tests import os import re import shutil +import socket import sys from typing import Dict, List, NamedTuple @@ -34,9 +35,10 @@ from .open_search_db_utils import (add_id, get_history_data, get_job_info, print_regressive_test_cases) from .pytorch_model_config import get_model_yaml_config from .sampler_options_config import get_sampler_options_config -from .utils import (AbstractPerfScriptTestClass, PerfBenchScriptTestCmds, - PerfDisaggScriptTestCmds, PerfMetricType, - PerfServerClientBenchmarkCmds, generate_test_nodes) +from .utils import (AbstractPerfScriptTestClass, PerfAggrScriptTestCmds, + PerfBenchScriptTestCmds, PerfDisaggScriptTestCmds, + PerfMetricType, PerfMultiNodeDisaggScriptTestCmds, + generate_test_nodes) if not hasattr(re, "Pattern"): re.Pattern = type(re.compile("")) @@ -103,6 +105,7 @@ MODEL_PATH_DICT = { "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4", "deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/", "deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/", + "deepseek_r1_0528_fp4_v2": "DeepSeek-R1/DeepSeek-R1-0528-FP4-v2/", "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8", "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only", "qwen2_7b_instruct": "Qwen2-7B-Instruct", @@ -310,7 +313,7 @@ BENCH_PERF_METRIC_LOG_QUERIES = { r"Final KV cache size after resize: ([\d\.]+) GiB).*"), } -SERVER_BENCHMARK_PERF_METRIC_LOG_QUERIES = { +AGGR_SERVER_PERF_METRIC_LOG_QUERIES = { PerfMetricType.SEQ_THROUGHPUT: re.compile(r"Request throughput \(req\/s\):\s+([\d\.]+)"), PerfMetricType.TOKEN_THROUGHPUT: @@ -345,13 +348,6 @@ SERVER_BENCHMARK_PERF_METRIC_LOG_QUERIES = { re.compile(r"P99 E2EL \(ms\):\s+([\d\.]+)"), } -DISAGG_SERVER_METRICS_LOG_QUERIES = { - 
PerfMetricType.DISAGG_SERVER_E2EL: - re.compile(r"Median E2EL \(ms\):\s*(\d+\.?\d*)"), - PerfMetricType.DISAGG_SERVER_TTFT: - re.compile(r"Median TTFT \(ms\):\s*(\d+\.?\d*)"), -} - # (Relative threshold, Absolute threshold) for all metric types PERF_METRIC_THRESHOLD = { PerfMetricType.BUILD_TIME: (0.1, 30), # Ignore build time regression < 30ms @@ -443,7 +439,7 @@ INFERENCE_METRICS = [ PerfMetricType.CONTEXT_GPU_MEMORY, ] -SERVER_BENCHMARK_METRICS = [ +AGGR_SERVER_METRICS = [ PerfMetricType.SEQ_THROUGHPUT, PerfMetricType.TOKEN_THROUGHPUT, PerfMetricType.TOTAL_TOKEN_THROUGHPUT, @@ -496,138 +492,247 @@ class PerfTestMetric(NamedTuple): cmd_idx: int +def to_env_dict(env_vars: str) -> Dict[str, str]: + env = {} + for env_var in env_vars.split(): + if "=" in env_var: + key, value = env_var.split("=", 1) + env[key] = value + return env + + class ServerConfig: """ Configurations of trtllm-server. """ - def __init__( - self, - name: str, - model_name: str, - gpus: int, - tp: int, - ep: int, - max_num_tokens: int, - attention_backend: str, - max_batch_size: int, - pp: int = 1, - enable_chunked_prefill: bool = False, - disable_overlap_scheduler: bool = False, - moe_backend: str = "", - moe_max_num_tokens: int = 0, - stream_interval: int = 10, - enable_attention_dp: bool = False, - attention_dp_balance: bool = False, - batching_wait_iters: int = 10, - timeout_iters: int = 50, - kv_cache_dtype: str = "fp8", - enable_block_reuse: bool = False, - free_gpu_memory_fraction: float = 0.8, - enable_padding: bool = True, - ): - self.name = name - self.model_name = model_name - self.gpus = gpus - self.tp = tp - self.ep = ep - self.pp = pp - self.max_num_tokens = max_num_tokens - self.enable_chunked_prefill = enable_chunked_prefill - self.disable_overlap_scheduler = disable_overlap_scheduler - self.attention_backend = attention_backend - self.moe_backend = moe_backend - self.moe_max_num_tokens = moe_max_num_tokens - self.stream_interval = stream_interval - self.enable_attention_dp = enable_attention_dp - self.attention_dp_balance = attention_dp_balance - self.batching_wait_iters = batching_wait_iters - self.timeout_iters = timeout_iters - self.kv_cache_dtype = kv_cache_dtype - self.enable_block_reuse = enable_block_reuse - self.free_gpu_memory_fraction = free_gpu_memory_fraction - self.max_batch_size = max_batch_size - self.enable_padding = enable_padding - + def __init__(self, server_config_data: dict, env_vars: str = ""): + # Extract required fields + self.name = server_config_data['name'] + self.model_name = server_config_data['model_name'] + self.gpus = server_config_data['gpus'] self.model_path = "" + self.env_vars = env_vars - def to_cmd(self, working_dir: str) -> List[str]: + # Extract optional fields with defaults + self.tp = server_config_data.get('tensor_parallel_size', self.gpus) + self.ep = server_config_data.get('moe_expert_parallel_size', 1) + self.pp = server_config_data.get('pipeline_parallel_size', 1) + self.gpus_per_node = server_config_data.get('gpus_per_node', self.gpus) + self.max_num_tokens = server_config_data.get('max_num_tokens', 2048) + self.max_batch_size = server_config_data.get('max_batch_size', 512) + self.max_seq_len = server_config_data.get('max_seq_len', 0) + self.disable_overlap_scheduler = server_config_data.get( + 'disable_overlap_scheduler', False) + self.num_postprocess_workers = server_config_data.get( + 'num_postprocess_workers', 0) + self.stream_interval = server_config_data.get('stream_interval', 10) + self.attn_backend = server_config_data.get('attn_backend', "TRTLLM") 
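+        # Optional tuning knobs below fall back to their .get() defaults when the
+        # YAML server entry omits them.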
+ self.enable_chunked_prefill = server_config_data.get( + 'enable_chunked_prefill', False) + self.enable_attention_dp = server_config_data.get( + 'enable_attention_dp', False) + self.trust_remote_code = server_config_data.get('trust_remote_code', + False) + + # attention_dp_config + attention_dp_config = server_config_data.get('attention_dp_config', {}) + self.attention_dp_balance = attention_dp_config.get( + 'enable_balance', False) + self.batching_wait_iters = attention_dp_config.get( + 'batching_wait_iters', 0) + self.timeout_iters = attention_dp_config.get('timeout_iters', 60) + + # moe_config + moe_config = server_config_data.get('moe_config', {}) + self.moe_backend = moe_config.get('backend', "") + self.moe_max_num_tokens = moe_config.get('max_num_tokens', 0) + + # cuda_graph_config + cuda_graph_config = server_config_data.get('cuda_graph_config', {}) + self.enable_cuda_graph = False + if cuda_graph_config: + self.enable_cuda_graph = True + self.enable_padding = cuda_graph_config.get('enable_padding', True) + self.cuda_graph_batch_sizes = cuda_graph_config.get( + 'batch_sizes', []) + self.cuda_graph_max_batch_size = cuda_graph_config.get( + 'max_batch_size', 0) + else: + self.enable_padding = True + self.cuda_graph_batch_sizes = [] + self.cuda_graph_max_batch_size = 0 + + # kv_cache_config + kv_cache_config = server_config_data.get('kv_cache_config', {}) + self.kv_cache_dtype = kv_cache_config.get('dtype', "fp8") + self.enable_block_reuse = kv_cache_config.get('enable_block_reuse', + False) + self.free_gpu_memory_fraction = kv_cache_config.get( + 'free_gpu_memory_fraction', 0.8) + + # cache_transceiver_config + cache_transceiver_config = server_config_data.get( + 'cache_transceiver_config', {}) + self.cache_transceiver_backend = cache_transceiver_config.get( + 'backend', "") + self.cache_transceiver_max_tokens_in_buffer = cache_transceiver_config.get( + 'max_tokens_in_buffer', 0) + + # speculative_config + speculative_config = server_config_data.get('speculative_config', {}) + self.spec_decoding_type = speculative_config.get('decoding_type', "") + self.num_nextn_predict_layers = speculative_config.get( + 'num_nextn_predict_layers', 0) + eagle3_value = speculative_config.get('eagle3_layers_to_capture', []) + if isinstance(eagle3_value, int): + self.eagle3_layers_to_capture = [eagle3_value] + elif isinstance(eagle3_value, list): + self.eagle3_layers_to_capture = eagle3_value + else: + self.eagle3_layers_to_capture = [] + self.max_draft_len = speculative_config.get('max_draft_len', 0) + self.speculative_model_dir = speculative_config.get( + 'speculative_model_dir', "") + + # Store filtered config for extra_llm_api_config (exclude name, model_name, gpus, client_configs) + self.extra_llm_api_config_data = { + k: v + for k, v in server_config_data.items() + if k not in ['name', 'model_name', 'gpus', 'client_configs'] + } + + def to_cmd(self, + output_dir: str, + numa_bind: bool = False, + disagg_serving_type: str = "", + hostname: str = "localhost", + port: int = 8000) -> List[str]: model_dir = get_model_dir(self.model_name) self.model_path = model_dir if os.path.exists( model_dir) else self.model_name - config_path = os.path.join(working_dir, - f"extra-llm-api-config.{self.name}.yml") - return [ - "trtllm-serve", self.model_path, "--host", "localhost", "--port", - "8000", "--backend", "pytorch", "--extra_llm_api_options", + config_filename = f"extra-llm-api-config.{self.name}.yml" + if "CTX" in disagg_serving_type: + config_filename = f"extra-llm-api-config.{self.name}.ctx.yml" + elif 
"GEN" in disagg_serving_type: + config_filename = f"extra-llm-api-config.{self.name}.gen.yml" + config_path = os.path.join(output_dir, config_filename) + + numa_bind_cmd = [] + if numa_bind: + numa_bind_cmd = ["numactl", "-m 0,1"] + + cmd = numa_bind_cmd + [ + "trtllm-serve", self.model_path, "--host", hostname, "--port", + str(port), "--backend", "pytorch", "--extra_llm_api_options", config_path ] + return cmd + + def to_env(self) -> Dict[str, str]: + return to_env_dict(self.env_vars) def to_db_data(self) -> dict: - """Convert ServerConfig to Database data""" - return { - "s_model_name": self.model_name.lower(), - "l_gpus": self.gpus, - "l_tp": self.tp, - "l_ep": self.ep, - "l_pp": self.pp, - "l_max_num_tokens": self.max_num_tokens, - "b_enable_chunked_prefill": self.enable_chunked_prefill, - "b_disable_overlap_scheduler": self.disable_overlap_scheduler, - "s_attention_backend": self.attention_backend, - "s_moe_backend": self.moe_backend, - "l_moe_max_num_tokens": self.moe_max_num_tokens, - "l_stream_interval": self.stream_interval, - "b_enable_attention_dp": self.enable_attention_dp, - "b_attention_dp_balance": self.attention_dp_balance, - "l_batching_wait_iters": self.batching_wait_iters, - "l_timeout_iters": self.timeout_iters, - "s_kv_cache_dtype": self.kv_cache_dtype, - "b_enable_block_reuse": self.enable_block_reuse, - "d_free_gpu_memory_fraction": self.free_gpu_memory_fraction, - "l_max_batch_size": self.max_batch_size, - "b_enable_padding": self.enable_padding, - "s_server_log_link": "", + db_data = { + "s_model_name": + self.model_name.lower(), + "l_gpus": + self.gpus, + "l_tp": + self.tp, + "l_ep": + self.ep, + "l_pp": + self.pp, + "l_gpus_per_node": + self.gpus_per_node, + "l_max_num_tokens": + self.max_num_tokens, + "l_max_batch_size": + self.max_batch_size, + "l_max_seq_len": + self.max_seq_len, + "b_disable_overlap_scheduler": + self.disable_overlap_scheduler, + "l_num_postprocess_workers": + self.num_postprocess_workers, + "l_stream_interval": + self.stream_interval, + "s_attn_backend": + self.attn_backend, + "b_enable_chunked_prefill": + self.enable_chunked_prefill, + "b_enable_attention_dp": + self.enable_attention_dp, + "b_trust_remote_code": + self.trust_remote_code, + # attention_dp_config + "b_attention_dp_balance": + self.attention_dp_balance, + "l_batching_wait_iters": + self.batching_wait_iters, + "l_timeout_iters": + self.timeout_iters, + # moe_config + "s_moe_backend": + self.moe_backend, + "l_moe_max_num_tokens": + self.moe_max_num_tokens, + # cuda_graph_config + "b_enable_cuda_graph": + self.enable_cuda_graph, + "b_enable_padding": + self.enable_padding, + "l_cuda_graph_max_batch_size": + self.cuda_graph_max_batch_size, + "s_cuda_graph_batch_sizes": + ",".join(map(str, self.cuda_graph_batch_sizes)), + # kv_cache_config + "s_kv_cache_dtype": + self.kv_cache_dtype, + "b_enable_block_reuse": + self.enable_block_reuse, + "d_free_gpu_memory_fraction": + self.free_gpu_memory_fraction, + # cache_transceiver_config + "s_cache_transceiver_backend": + self.cache_transceiver_backend, + "l_cache_transceiver_max_tokens_in_buffer": + self.cache_transceiver_max_tokens_in_buffer, + # speculative_config + "s_spec_decoding_type": + self.spec_decoding_type, + "l_num_nextn_predict_layers": + self.num_nextn_predict_layers, + "s_eagle3_layers_to_capture": + ",".join(map(str, self.eagle3_layers_to_capture)), + "l_max_draft_len": + self.max_draft_len, + "s_speculative_model_dir": + self.speculative_model_dir, + "s_server_log_link": + "", + "s_server_env_var": + self.env_vars, } + 
return db_data def generate_extra_llm_api_config(self) -> str: """Generate extra-llm-api-config.yml content""" - config_lines = [ - f"tensor_parallel_size: {self.tp}", - f"moe_expert_parallel_size: {self.ep}", - f"pipeline_parallel_size: {self.pp}", - f"max_num_tokens: {self.max_num_tokens}", - f"enable_attention_dp: {str(self.enable_attention_dp).lower()}", - f"disable_overlap_scheduler: {str(self.disable_overlap_scheduler).lower()}", - f"stream_interval: {self.stream_interval}", - f"attn_backend: {self.attention_backend}", - f"enable_chunked_prefill: {str(self.enable_chunked_prefill).lower()}", - "cuda_graph_config:", - f" enable_padding: {str(self.enable_padding).lower()}", - f" max_batch_size: {self.max_batch_size}", - "kv_cache_config:", - f" dtype: {self.kv_cache_dtype}", - f" free_gpu_memory_fraction: {self.free_gpu_memory_fraction}", - f" enable_block_reuse: {str(self.enable_block_reuse).lower()}", - "print_iter_log: false", - ] + # Make a copy to avoid modifying the original + config_data = dict(self.extra_llm_api_config_data) - # Add moe_config if moe_backend is specified - if self.moe_backend: - config_lines.append("moe_config:") - config_lines.append(f" backend: {self.moe_backend}") - if self.moe_max_num_tokens: - config_lines.append( - f" max_num_tokens: {self.moe_max_num_tokens}") + # Handle speculative_model_dir path conversion if it exists + if 'speculative_config' in config_data and 'speculative_model_dir' in config_data[ + 'speculative_config']: + spec_model_dir = config_data['speculative_config'][ + 'speculative_model_dir'] + if spec_model_dir: + config_data['speculative_config'][ + 'speculative_model_dir'] = os.path.join( + llm_models_root(), spec_model_dir) - if self.attention_dp_balance: - config_lines.append("attention_dp_balance:") - config_lines.append(" enable_balance: true") - config_lines.append( - f" batching_wait_iters: {self.batching_wait_iters}") - config_lines.append(f" timeout_iters: {self.timeout_iters}") - - return "\n".join(config_lines) + return yaml.dump(config_data, default_flow_style=False, sort_keys=False) class ClientConfig: @@ -636,28 +741,30 @@ class ClientConfig: """ def __init__(self, - name: str, + client_config_data: dict, model_name: str, - concurrency: int, - iterations: int, - isl: int, - osl: int, - random_range_ratio: float = 0.0): - self.name = name + env_vars: str = ""): + self.name = client_config_data.get('name', '') self.model_name = model_name - self.concurrency = concurrency - self.iterations = iterations - self.isl = isl - self.osl = osl - self.random_range_ratio = random_range_ratio - + self.concurrency = client_config_data.get('concurrency', 1) + self.iterations = client_config_data.get('iterations', 1) + self.isl = client_config_data.get('isl', 1024) + self.osl = client_config_data.get('osl', 1024) + self.random_range_ratio = client_config_data.get( + 'random_range_ratio', 0.0) + self.backend = client_config_data.get('backend', "") + self.use_chat_template = client_config_data.get('use_chat_template', + False) + self.streaming = client_config_data.get('streaming', True) self.model_path = "" + self.env_vars = env_vars - def to_cmd(self, working_dir: str) -> List[str]: + def to_cmd(self, need_hostname: bool = True) -> List[str]: model_dir = get_model_dir(self.model_name) self.model_path = model_dir if os.path.exists( model_dir) else self.model_name - return [ + + benchmark_cmd = [ "python", "-m", "tensorrt_llm.serve.scripts.benchmark_serving", "--model", self.model_path, "--dataset-name", "random", "--random-ids", 
"--num-prompts", @@ -668,17 +775,40 @@ class ClientConfig: "--percentile-metrics", "ttft,tpot,itl,e2el", "--max-concurrency", str(self.concurrency) ] + if need_hostname: + hostname_port = ["--host", "localhost", "--port", "8000"] + benchmark_cmd.extend(hostname_port) + if self.backend: + benchmark_cmd.append("--backend") + benchmark_cmd.append(self.backend) + if self.use_chat_template: + benchmark_cmd.append("--use-chat-template") + if not self.streaming: + benchmark_cmd.append("--non-streaming") + return benchmark_cmd + + def to_env(self) -> Dict[str, str]: + return to_env_dict(self.env_vars) def to_db_data(self) -> dict: """Convert ClientConfig to Database data""" - return { + db_data = { "l_concurrency": self.concurrency, "l_iterations": self.iterations, "l_isl": self.isl, "l_osl": self.osl, "d_random_range_ratio": self.random_range_ratio, + "s_backend": self.backend, + "b_use_chat_template": self.use_chat_template, + "b_streaming": self.streaming, "s_client_log_link": "", + "s_client_env_vars": self.env_vars, } + if self.backend: + db_data["s_backend"] = self.backend + if self.use_chat_template: + db_data["b_use_chat_template"] = self.use_chat_template + return db_data def parse_select_pattern(select_pattern: str): @@ -720,8 +850,8 @@ def parse_select_pattern(select_pattern: str): return execution_plan -def parse_config_file(config_file_path: str, select_pattern: str = None): - """Parse YAML configuration file and create ServerConfig and ClientConfig objects +def parse_aggr_config_file(config_file_path: str, select_pattern: str = None): + """Parse YAML configuration file and create ServerConfig and ClientConfig objects for aggregated server Args: config_file_path: Path to YAML configuration file @@ -742,6 +872,16 @@ def parse_config_file(config_file_path: str, select_pattern: str = None): with open(config_file_path, 'r') as f: config = yaml.safe_load(f) + # Read environment config + environment = config.get('environment', {}) + if not environment: + environment = {} + + # Get environment variables + environment.get('worker_env_var', '') + server_env_var = environment.get('server_env_var', '') + client_env_var = environment.get('client_env_var', '') + server_configs = [] server_client_configs = {} @@ -752,39 +892,8 @@ def parse_config_file(config_file_path: str, select_pattern: str = None): if execution_plan is not None and server_name not in execution_plan: continue - # Create ServerConfig object - server_config = ServerConfig( - name=server_config_data['name'], - model_name=server_config_data['model_name'], - gpus=server_config_data['gpus'], - tp=server_config_data['tp'], - ep=server_config_data['ep'], - pp=server_config_data.get('pp', 1), - attention_backend=server_config_data.get('attention_backend', - 'TRTLLM'), - moe_backend=server_config_data.get('moe_backend', ''), - moe_max_num_tokens=server_config_data.get('moe_max_num_tokens', 0), - stream_interval=server_config_data.get('stream_interval', 10), - enable_attention_dp=server_config_data.get('enable_attention_dp', - False), - attention_dp_balance=server_config_data.get('attention_dp_balance', - False), - batching_wait_iters=server_config_data.get('batching_wait_iters', - 10), - timeout_iters=server_config_data.get('timeout_iters', 50), - enable_chunked_prefill=server_config_data.get( - 'enable_chunked_prefill', False), - max_num_tokens=server_config_data.get('max_num_tokens', 2048), - disable_overlap_scheduler=server_config_data.get( - 'disable_overlap_scheduler', False), - kv_cache_dtype=server_config_data.get('kv_cache_dtype', 
'fp8'), - enable_block_reuse=server_config_data.get('enable_block_reuse', - False), - free_gpu_memory_fraction=server_config_data.get( - 'free_gpu_memory_fraction', 0.8), - max_batch_size=server_config_data.get('max_batch_size', 256), - enable_padding=server_config_data.get('enable_padding', True)) - + # Create ServerConfig object directly from dict + server_config = ServerConfig(server_config_data, server_env_var) server_id = len(server_configs) server_configs.append(server_config) @@ -802,15 +911,9 @@ def parse_config_file(config_file_path: str, select_pattern: str = None): if client_name not in selected_client_names: continue - client_config = ClientConfig( - name=client_config_data['name'], - model_name=server_config_data['model_name'], - concurrency=client_config_data['concurrency'], - iterations=client_config_data.get('iterations', 1), - isl=client_config_data.get('isl', 1024), - osl=client_config_data.get('osl', 1024), - random_range_ratio=client_config_data.get( - 'random_range_ratio', 0.0)) + client_config = ClientConfig(client_config_data, + server_config_data['model_name'], + client_env_var) client_configs.append(client_config) server_client_configs[server_id] = client_configs @@ -818,6 +921,87 @@ def parse_config_file(config_file_path: str, select_pattern: str = None): return execution_plan, server_configs, server_client_configs +def parse_multi_node_disagg_config_file(config_file_path: str, + select_pattern: str = None): + disagg_serving_type = os.environ.get("DISAGG_SERVING_TYPE", "BENCHMARK") + + # Read YAML config file + with open(config_file_path, 'r') as f: + config = yaml.safe_load(f) + + disagg_configs = [] + hardware = config.get('hardware', {}) + benchmark = config.get('benchmark', {}) + environment = config.get('environment', {}) + slurm_config = config.get('slurm', {}) + worker_config = config.get('worker_config', {}) + timeout = slurm_config.get('timeout', 3600) + numa_bind = slurm_config.get('numa_bind', False) + + # Get model name from environment + model_name = environment.get('model_name', '') + assert model_name, "model_name is required in environment section" + + # Get environment variables + worker_env_var = environment.get('worker_env_var', '') + server_env_var = environment.get('server_env_var', '') + client_env_var = environment.get('client_env_var', '') + + # Create ctx_server config data + ctx_server_config_data = { + 'name': 'ctx_server', + 'model_name': model_name, + 'gpus': hardware.get('gpus_per_ctx_server'), + 'gpus_per_node': hardware.get('gpus_per_node'), + **worker_config.get('ctx', {}) + } + + # Create gen_server config data + gen_server_config_data = { + 'name': 'gen_server', + 'model_name': model_name, + 'gpus': hardware.get('gpus_per_gen_server'), + 'gpus_per_node': hardware.get('gpus_per_node'), + **worker_config.get('gen', {}) + } + + # Create client config data + concurrency_str = benchmark.get('concurrency_list', '1') + concurrency = int(concurrency_str) if isinstance(concurrency_str, + str) else concurrency_str + + client_config_data = { + 'name': 'client', + 'concurrency': concurrency, + 'iterations': benchmark.get('multi_round', 1), + 'isl': benchmark.get('input_length', 1024), + 'osl': benchmark.get('output_length', 1024), + 'random_range_ratio': benchmark.get('benchmark_ratio', 0.0), + 'backend': 'openai', + 'use_chat_template': False, + 'streaming': benchmark.get('streaming', True), + } + + # Create disagg_config dict + disagg_config = { + 'disagg_serving_type': disagg_serving_type, + 'hostname': socket.gethostname(), + 
'numa_bind': numa_bind, + 'timeout': timeout, + 'name': 'disagg_config', + 'model_name': model_name, + 'hardware': hardware, + 'ctx_server': ServerConfig(ctx_server_config_data, worker_env_var), + 'gen_server': ServerConfig(gen_server_config_data, worker_env_var), + 'server_env_var': server_env_var, + 'client': ClientConfig(client_config_data, model_name, client_env_var), + } + print_info(f"disagg_config: {disagg_config}") + disagg_configs.append(disagg_config) + + return disagg_configs + + class PerfTestConfig: """ Configurations defining the LLM perf test. @@ -928,15 +1112,16 @@ class PerfTestConfig: self.gen_server_workers = 0 # Used for perf sanity test - # config_file: YAML path, select_pattern: server/client selection string - # server_configs: list[ServerConfig], server_client_configs: dict[server_id -> list[ClientConfig]] self.upload_to_db = False self.config_file = None self.gpu_type = None self.config_path = None self.select_pattern = None + # Aggregated mode self.server_configs = [] self.server_client_configs = {} + # Multi-node disaggregated mode + self.disagg_configs = [] def _to_string_disagg(self, entries: List[str]): entries.append(f"disagg_server") @@ -965,10 +1150,16 @@ class PerfTestConfig: # Used for perf sanity test if self.config_file is not None: entries = ["perf_sanity", self.config_file] - if custom_server_name is not None: - entries.append(f"server:{custom_server_name}") - if custom_client_name is not None: - entries.append(f"client:{custom_client_name}") + if "disagg" in self.config_file: + # For multi-node disagg, add disagg config name + if custom_server_name is not None: + entries.append(f"disagg:{custom_server_name}") + else: + # For aggr_server + if custom_server_name is not None: + entries.append(f"server:{custom_server_name}") + if custom_client_name is not None: + entries.append(f"client:{custom_client_name}") return "-".join(entries) # First, add the model name. @@ -1140,15 +1331,33 @@ class PerfTestConfig: # Extract configs from test param labels. labels = test_param_labels.split("-") + def get_gpu_type(label: str) -> str: + parts = label.split("_") + if len(parts) < 2 or parts[0] != "l0": + return "" + if parts[1] == "dgx": + if len(parts) >= 3: + gpu_type = f"{parts[1]}_{parts[2]}" + else: + gpu_type = "" + else: + gpu_type = parts[1] + return gpu_type.lower() + # Used for perf sanity test if "perf_sanity" in labels[0]: assert len(labels) > 1, "perf_sanity test must have a config file!" - self.runtime = "server-benchmark" self.upload_to_db = "upload" in labels[0] self.config_file = labels[1] - self.gpu_type = labels[1].replace("l0_", "").lower() + if "disagg" in labels[1]: + self.runtime = "multi_node_disagg_server" + else: + self.runtime = "aggr_server" + self.gpu_type = get_gpu_type(labels[1]) + config_folder = os.getenv("TRTLLM_CONFIG_FOLDER", + "tests/scripts/perf-sanity") self.config_path = os.path.join( - "tests/scripts/perf-sanity", f"{labels[1]}.yaml" + config_folder, f"{labels[1]}.yaml" if not labels[1].endswith(".yaml") else labels[1]) self.select_pattern = labels[2] if len(labels) > 2 else None return @@ -1370,14 +1579,21 @@ class PerfTestConfig: [b >= 32 for b in self.batch_sizes] ), f"gpt_350m and bloom_560m with small BS are very unstable! Please increase to at least 32." - def set_server_client_configs(self, llm_root: str) -> None: + def set_aggr_server_configs(self, llm_root: str) -> None: """ Set the server and client configs. 
""" - if self.runtime == "server-benchmark": - config_file_path = os.path.join(llm_root, self.config_path) - _, self.server_configs, self.server_client_configs = parse_config_file( - config_file_path, self.select_pattern) + config_file_path = os.path.join(llm_root, self.config_path) + _, self.server_configs, self.server_client_configs = parse_aggr_config_file( + config_file_path, self.select_pattern) + + def set_multi_node_disagg_server_configs(self, llm_root: str) -> None: + """ + Set the multi-node disaggregated server configs. + """ + config_file_path = os.path.join(llm_root, self.config_path) + self.disagg_configs = parse_multi_node_disagg_config_file( + config_file_path, self.select_pattern) def get_model_family(self) -> str: """ @@ -1464,6 +1680,7 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): def set_runtime_configs(self, llm_root, working_dir, + output_dir, perf_cache_fpath, gpu_clock_lock=None) -> None: if self._config.runtime == "cpp": @@ -1477,11 +1694,14 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): llm_root) elif self._config.runtime == "bench": benchmark_script = "trtllm-bench" - elif self._config.runtime == "server-benchmark": + elif self._config.runtime == "aggr_server": benchmark_script = None - self._config.set_server_client_configs(llm_root) + self._config.set_aggr_server_configs(llm_root) elif self._config.runtime == "disagg_server": benchmark_script = None + elif self._config.runtime == "multi_node_disagg_server": + benchmark_script = None + self._config.set_multi_node_disagg_server_configs(llm_root) else: raise RuntimeError(f"Invalid runtime {self._config.runtime}.") @@ -1490,7 +1710,9 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): if self._config.runtime == "bench": build_script = "trtllm-bench" - elif self._config.runtime == "server-benchmark": + elif self._config.runtime == "aggr_server": + build_script = None + elif self._config.runtime == "multi_node_disagg_server": build_script = None elif self._config.pp_size > 1 or self._config.model_name not in allowed_models: build_script = "trtllm-build" @@ -1502,31 +1724,114 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): self._build_script = build_script self._benchmark_script = benchmark_script self._working_dir = working_dir + self._output_dir = output_dir self._perf_cache_fpath = perf_cache_fpath self._llm_root = llm_root self._gpu_clock_lock = gpu_clock_lock - def get_trtllm_server_client_commands(self): + def get_trtllm_aggr_commands(self, output_dir): server_cmds = [] + server_envs = [] client_cmds = [] + client_envs = [] names = [] for server_idx, client_configs in self._config.server_client_configs.items( ): server_config = self._config.server_configs[server_idx] - server_cmd = server_config.to_cmd(self._working_dir) - server_cmd = " ".join(server_cmd) + server_cmd = server_config.to_cmd(output_dir) + server_env = server_config.to_env() # Generate extra-llm-api-config.yml config_content = server_config.generate_extra_llm_api_config() config_filename = f"extra-llm-api-config.{server_config.name}.yml" - config_path = os.path.join(self._working_dir, config_filename) + config_path = os.path.join(output_dir, config_filename) with open(config_path, 'w') as f: f.write(config_content) for client_config in client_configs: server_cmds.append(server_cmd) - client_cmd = client_config.to_cmd(self._working_dir) + server_envs.append(server_env) + client_cmd = client_config.to_cmd(need_hostname=True) + client_env = client_config.to_env() client_cmds.append(client_cmd) + 
client_envs.append(client_env) names.append(f"{server_config.name}-{client_config.name}") - return server_cmds, client_cmds, names + return server_cmds, server_envs, client_cmds, client_envs, names + + def get_trtllm_multi_node_disagg_commands(self, output_dir): + ctx_server_cmds = [] + ctx_server_envs = [] + gen_server_cmds = [] + gen_server_envs = [] + disagg_server_cmds = [] + disagg_server_envs = [] + benchmark_cmds = [] + benchmark_envs = [] + # Create hostnames directory + hostnames_dir = os.path.join(output_dir, "hostnames") + if not os.path.exists(hostnames_dir): + os.makedirs(hostnames_dir, exist_ok=True) + + for disagg_config in self._config.disagg_configs: + disagg_serving_type = disagg_config['disagg_serving_type'] + hostname = disagg_config['hostname'] + numa_bind = disagg_config['numa_bind'] + ctx_server_cmd = None + ctx_server_env = None + gen_server_cmd = None + gen_server_env = None + disagg_server_cmd = None + disagg_server_env = None + benchmark_cmd = None + benchmark_env = None + if "CTX" in disagg_serving_type or "GEN" in disagg_serving_type: + # Write hostname to hostnames folder + hostname_file = os.path.join(hostnames_dir, + f"{disagg_serving_type}.txt") + with open(hostname_file, 'w') as f: + f.write(hostname) + # Generate CTX or GEN server commands if this is a CTX or GEN node + is_ctx = "CTX" in disagg_serving_type + server_config = disagg_config[ + 'ctx_server'] if is_ctx else disagg_config['gen_server'] + server_cmd = server_config.to_cmd(output_dir, numa_bind, + disagg_serving_type, hostname, + 8336) + server_env = server_config.to_env() + if is_ctx: + ctx_server_cmd = server_cmd + ctx_server_env = server_env + else: + gen_server_cmd = server_cmd + gen_server_env = server_env + # Generate extra-llm-api-config.yml + config_content = server_config.generate_extra_llm_api_config() + config_filename = f"extra-llm-api-config.{server_config.name}.{'ctx' if is_ctx else 'gen'}.yml" + config_path = os.path.join(output_dir, config_filename) + with open(config_path, 'w') as f: + f.write(config_content) + elif "DISAGG_SERVER" in disagg_serving_type: + timeout = disagg_config['timeout'] + # Generate DISAGG server command if this is the DISAGG server node + disagg_server_cmd = [ + "trtllm-serve", "disaggregated", "-c", + f"{output_dir}/server_config.yaml", "-t", + str(timeout), "-r", + str(timeout) + ] + disagg_server_env = to_env_dict(disagg_config['server_env_var']) + elif "BENCHMARK" in disagg_serving_type: + # Generate benchmark command if this is the BENCHMARK server node + benchmark_cmd = disagg_config['client'].to_cmd( + need_hostname=False) + benchmark_env = disagg_config['client'].to_env() + ctx_server_cmds.append(ctx_server_cmd) + ctx_server_envs.append(ctx_server_env) + gen_server_cmds.append(gen_server_cmd) + gen_server_envs.append(gen_server_env) + disagg_server_cmds.append(disagg_server_cmd) + disagg_server_envs.append(disagg_server_env) + benchmark_cmds.append(benchmark_cmd) + benchmark_envs.append(benchmark_env) + return ctx_server_cmds, ctx_server_envs, gen_server_cmds, gen_server_envs, disagg_server_cmds, disagg_server_envs, benchmark_cmds, benchmark_envs def get_trtllm_build_command(self, engine_dir, checkpoint_dir) -> list: build_cmd = [ @@ -1793,25 +2098,26 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): return benchmark_cmd def get_commands(self): - # Whether this is python or cpp runtime perf test. 
is_python = self._config.runtime == "python" num_gpus = self._config.num_gpus - is_server_benchmark = self._config.runtime == "server-benchmark" + is_aggr = self._config.runtime == "aggr_server" is_disagg = self._config.runtime == "disagg_server" - - if is_server_benchmark: - perf_sanity_working_dir = os.path.join(self._working_dir, - "perf-sanity") - if not os.path.exists(perf_sanity_working_dir): - os.makedirs(perf_sanity_working_dir, exist_ok=True) - server_cmds, client_cmds, names = self.get_trtllm_server_client_commands( - ) - return PerfServerClientBenchmarkCmds( - server_cmds=server_cmds, - client_cmds=client_cmds, - names=names, - working_dir=perf_sanity_working_dir) + is_multi_node_disagg = self._config.runtime == "multi_node_disagg_server" + perf_sanity_output_dir = os.path.join(self._output_dir, + self._test_param_labels) + if is_aggr: + if not os.path.exists(perf_sanity_output_dir): + os.makedirs(perf_sanity_output_dir, exist_ok=True) + server_cmds, server_envs, client_cmds, client_envs, names = self.get_trtllm_aggr_commands( + perf_sanity_output_dir) + return PerfAggrScriptTestCmds(server_cmds=server_cmds, + server_envs=server_envs, + client_cmds=client_cmds, + client_envs=client_envs, + names=names, + timeout=3600, + output_dir=perf_sanity_output_dir) if is_disagg: ctx_cmd, gen_cmd = self._get_disagg_worker_deploy_command() @@ -1821,6 +2127,30 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): return PerfDisaggScriptTestCmds(ctx_cmd, gen_cmd, server_cmd, client_cmd, benchmark_cmd) + if is_multi_node_disagg: + if not os.path.exists(perf_sanity_output_dir): + os.makedirs(perf_sanity_output_dir, exist_ok=True) + ctx_server_cmds, ctx_server_envs, gen_server_cmds, gen_server_envs, disagg_server_cmds, disagg_server_envs, benchmark_cmds, benchmark_envs = self.get_trtllm_multi_node_disagg_commands( + perf_sanity_output_dir) + return PerfMultiNodeDisaggScriptTestCmds( + ctx_server_cmds=ctx_server_cmds, + ctx_server_envs=ctx_server_envs, + gen_server_cmds=gen_server_cmds, + gen_server_envs=gen_server_envs, + disagg_server_cmds=disagg_server_cmds, + disagg_server_envs=disagg_server_envs, + benchmark_cmds=benchmark_cmds, + benchmark_envs=benchmark_envs, + timeout=self._config.disagg_configs[0]['timeout'], + hostname=self._config.disagg_configs[0]['hostname'], + disagg_serving_type=self._config.disagg_configs[0] + ['disagg_serving_type'], + num_ctx_servers=self._config.disagg_configs[0]['hardware'] + ['num_ctx_servers'], + num_gen_servers=self._config.disagg_configs[0]['hardware'] + ['num_gen_servers'], + output_dir=perf_sanity_output_dir) + if is_python and num_gpus > 1: # TODO: Fix https://nvbugs/4449875 pytest.skip( @@ -1976,7 +2306,6 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): Run through the commands and parse multiple perf metrics from the logs. """ #print info to separate cases - print_info(f"Running perf test for case: {self._short_test_name}") self._current_cmd_idx = 0 metrics = self._get_metrics() outputs = {} @@ -2078,8 +2407,19 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): """ Upload the test results and baseline to database. """ - # Currently only server-benchmark need to store the test result. 
- if self._config.runtime == "server-benchmark": + + def prefix_server_config_dict(config_dict: dict, + prefix_name: str) -> dict: + prefixed_dict = {} + for key, value in config_dict.items(): + type_prefix = key[0:2] # 'l_', 's_', 'b_', 'd_' + rest = key[2:] + prefixed_dict[f"{type_prefix}{prefix_name}_{rest}"] = value + return prefixed_dict + + match_keys = [] + # Only aggr_server and multi_node_disagg_server will upload. + if self._config.runtime == "aggr_server": job_config = get_job_info() job_config["s_gpu_type"] = self._config.gpu_type is_post_merge = job_config["b_is_post_merge"] @@ -2094,49 +2434,115 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): # If cmd_idx not in self._test_results or some metrics missing, skip this cmd_idx if cmd_idx not in self._test_results or not all( metric_type in self._test_results[cmd_idx] - for metric_type in SERVER_BENCHMARK_METRICS): + for metric_type in AGGR_SERVER_METRICS): print_info( f"Skipped posting command {cmd_idx} 's test results since some metrics are missing in test results." ) cmd_idx += 1 continue - new_data = {} + new_data = { + "s_runtime": + "multi_node_aggr_server" if server_config.gpus + != server_config.gpus_per_node else "aggr_server" + } new_data.update(job_config) new_data.update(server_config_dict) new_data.update(client_config_dict) - for metric_type in SERVER_BENCHMARK_METRICS: + for metric_type in AGGR_SERVER_METRICS: new_data[ f"d_{PERF_METRIC_STRING[metric_type]}"] = self._test_results[ cmd_idx][metric_type] add_id(new_data) new_data_dict[cmd_idx] = new_data cmd_idx += 1 + if not match_keys: + match_keys.append("s_runtime") + match_keys.extend(server_config_dict.keys()) + match_keys.extend(client_config_dict.keys()) - # Get history data for each cmd_idx - history_baseline_dict, history_data_dict = get_history_data( - new_data_dict) - # Prepare regressive test cases - regressive_data_list = prepare_regressive_test_cases( - history_baseline_dict, new_data_dict) - - if is_post_merge: - # Prepare new baseline data for post-merge - new_baseline_data_dict = prepare_baseline_data( - history_baseline_dict, history_data_dict, new_data_dict) - else: - # Pre-merge does not need to upload baseline data - new_baseline_data_dict = None - - if self._config.upload_to_db: - # Upload the new perf data and baseline data to database - post_new_perf_data(new_baseline_data_dict, new_data_dict, - regressive_data_list) - - # Print regressive test cases - print_regressive_test_cases(regressive_data_list) + elif self._config.runtime == "multi_node_disagg_server": + if self._config.disagg_configs[0][ + 'disagg_serving_type'] != "BENCHMARK": + return + job_config = get_job_info() + job_config["s_gpu_type"] = self._config.gpu_type + is_post_merge = job_config["b_is_post_merge"] + new_data_dict = {} + cmd_idx = 0 + for disagg_config in self._config.disagg_configs: + # If cmd_idx not in self._test_results or some metrics missing, skip this cmd_idx + if cmd_idx not in self._test_results or not all( + metric_type in self._test_results[cmd_idx] + for metric_type in AGGR_SERVER_METRICS): + print_info( + f"Skipped posting command {cmd_idx} 's test results since some metrics are missing in test results." 
+ ) + cmd_idx += 1 + continue + # Get ctx_server and gen_server configs with prefixed keys + ctx_server_config_dict = disagg_config['ctx_server'].to_db_data( + ) + gen_server_config_dict = disagg_config['gen_server'].to_db_data( + ) + ctx_server_config_dict = prefix_server_config_dict( + ctx_server_config_dict, 'ctx') + gen_server_config_dict = prefix_server_config_dict( + gen_server_config_dict, 'gen') + client_config_dict = disagg_config['client'].to_db_data() + # Build new_data + new_data = { + "s_runtime": "multi_node_disagg_server", + "s_server_env_var": disagg_config['server_env_var'] + } + new_data.update(job_config) + new_data.update(ctx_server_config_dict) + new_data.update(gen_server_config_dict) + new_data.update(client_config_dict) + # Add hardware information + hardware = disagg_config.get('hardware', {}) + new_data["l_num_ctx_servers"] = hardware.get( + 'num_ctx_servers', 0) + new_data["l_num_gen_servers"] = hardware.get( + 'num_gen_servers', 0) + # Add metrics from test results + for metric_type in AGGR_SERVER_METRICS: + new_data[ + f"d_{PERF_METRIC_STRING[metric_type]}"] = self._test_results[ + cmd_idx][metric_type] + add_id(new_data) + new_data_dict[cmd_idx] = new_data + cmd_idx += 1 + if not match_keys: + match_keys.extend( + ["s_runtime", "l_num_ctx_servers", "l_num_gen_servers"]) + match_keys.extend(ctx_server_config_dict.keys()) + match_keys.extend(gen_server_config_dict.keys()) + match_keys.extend(client_config_dict.keys()) else: return + # Get history data for each cmd_idx + history_baseline_dict, history_data_dict = get_history_data( + new_data_dict, self._config.gpu_type, match_keys) + # Prepare regressive test cases + regressive_data_list = prepare_regressive_test_cases( + history_baseline_dict, new_data_dict) + + if is_post_merge: + # Prepare new baseline data for post-merge + new_baseline_data_dict = prepare_baseline_data( + history_baseline_dict, history_data_dict, new_data_dict) + else: + # Pre-merge does not need to upload baseline data + new_baseline_data_dict = None + + if self._config.upload_to_db: + # Upload the new perf data and baseline data to database + post_new_perf_data(new_baseline_data_dict, new_data_dict, + regressive_data_list) + + print_regressive_test_cases(regressive_data_list) + def _get_engine_dir(self) -> str: """ Get the engine directory to store the engine. @@ -2150,13 +2556,13 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): Generate all the metric configs for the current test. 
""" metrics = [] - if self._config.runtime == "server-benchmark": + if self._config.runtime == "aggr_server": cmd_idx = 0 for server_idx, client_configs in self._config.server_client_configs.items( ): server_name = self._config.server_configs[server_idx].name for client_config in client_configs: - for metric_type in SERVER_BENCHMARK_METRICS: + for metric_type in AGGR_SERVER_METRICS: metrics.append( PerfTestMetric( original_test_name=self._full_test_name, @@ -2193,6 +2599,28 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): )) return metrics + if self._config.runtime == "multi_node_disagg_server": + cmd_idx = 0 + for disagg_config in self._config.disagg_configs: + config_name = disagg_config['name'] + for metric_type in AGGR_SERVER_METRICS: + metrics.append( + PerfTestMetric( + original_test_name=self._full_test_name, + metric_name=self._get_metric_name( + metric_type=metric_type, + disagg_config_name=config_name), + metric_type=metric_type, + metric_regex=self._get_metric_regex(metric_type), + metric_threshold=self._get_metric_threshold( + metric_type), + metric_abs_threshold=self._get_metric_abs_threshold( + metric_type), + cmd_idx=cmd_idx, + )) + cmd_idx += 1 + return metrics + # Build command is the first command. cmd_idx = 0 if self._config.runtime != "bench" else 1 if self._config.runtime == "bench": @@ -2264,7 +2692,8 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): input_len: int = None, output_len: int = None, server_name: str = None, - client_name: str = None) -> str: + client_name: str = None, + disagg_config_name: str = None) -> str: """ Construct the metric name for given metric_type, bs, input_len, and output_len. """ @@ -2278,11 +2707,14 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): if metric_type in BUILDER_METRICS: # We build one engine for all benchmark runs, so add all bs and seq lens to the metric name. metric_label = self._config.to_string(device_subtype=device_subtype) - elif self._config.runtime == "server-benchmark": + elif self._config.runtime == "aggr_server": metric_label = self._config.to_string( custom_server_name=server_name, custom_client_name=client_name, ) + elif self._config.runtime == "multi_node_disagg_server": + metric_label = self._config.to_string( + custom_server_name=disagg_config_name) else: # Otherwise, generate per-bs and per-seqlen label. 
metric_label = self._config.to_string( @@ -2303,10 +2735,14 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass): if metric_type not in BENCH_PERF_METRIC_LOG_QUERIES: raise ValueError(f"Unexpected metric_type: {metric_type}") return BENCH_PERF_METRIC_LOG_QUERIES[metric_type] - elif self._config.runtime == "server-benchmark": - if metric_type not in SERVER_BENCHMARK_PERF_METRIC_LOG_QUERIES: + elif self._config.runtime == "aggr_server": + if metric_type not in AGGR_SERVER_PERF_METRIC_LOG_QUERIES: raise ValueError(f"Unexpected metric_type: {metric_type}") - return SERVER_BENCHMARK_PERF_METRIC_LOG_QUERIES[metric_type] + return AGGR_SERVER_PERF_METRIC_LOG_QUERIES[metric_type] + elif self._config.runtime == "multi_node_disagg_server": + if metric_type not in AGGR_SERVER_PERF_METRIC_LOG_QUERIES: + raise ValueError(f"Unexpected metric_type: {metric_type}") + return AGGR_SERVER_PERF_METRIC_LOG_QUERIES[metric_type] else: pytest.skip("only support trtllm-bench runtime for now") @@ -2491,7 +2927,7 @@ def run_perf_test(perf_case_name, trt_performance_cache_fpath, """ working_dir = llm_venv.get_working_directory() test_runner = MultiMetricPerfTest(perf_case_name) - test_runner.set_runtime_configs(llm_root, working_dir, + test_runner.set_runtime_configs(llm_root, working_dir, output_dir, trt_performance_cache_fpath, trt_gpu_clock_lock) test_runner.run_metrics(llm_venv, trt_gpu_clock_lock, diff --git a/tests/integration/defs/perf/utils.py b/tests/integration/defs/perf/utils.py index f6296e0b30..d3c38ddb2d 100644 --- a/tests/integration/defs/perf/utils.py +++ b/tests/integration/defs/perf/utils.py @@ -26,6 +26,7 @@ from pathlib import Path from typing import Dict, List, NamedTuple, Optional import requests +import yaml from _pytest.nodes import Item from _pytest.python import Function from defs.trt_test_alternative import (check_output, popen, print_error, @@ -235,54 +236,80 @@ class PerfBenchScriptTestCmds(NamedTuple): return cmd_str -class PerfServerClientBenchmarkCmds(NamedTuple): - server_cmds: List[str] +class PerfAggrScriptTestCmds(NamedTuple): + server_cmds: List[List[str]] + server_envs: List[Dict[str, str]] client_cmds: List[List[str]] + client_envs: List[Dict[str, str]] names: List[str] - working_dir: str + timeout: int + output_dir: str - def wait_for_endpoint_ready(self, url: str, timeout: int = 5400): + def wait_for_endpoint_ready(self, url: str, timeout: int = 7200): start = time.monotonic() - while time.monotonic() - start < timeout: + while True: + elapsed_time = time.monotonic() - start + if elapsed_time > timeout: + print_error( + f"Timeout waiting for endpoint {url} to be ready after {timeout} seconds" + ) + break try: - time.sleep(10) + print_info( + f"Waiting for endpoint {url} to be ready, elapsed time: {elapsed_time}s" + ) + time.sleep(1) if requests.get(url).status_code == 200: - print(f"endpoint {url} is ready") + print_info(f"endpoint {url} is ready") return except Exception as err: - print(f"endpoint {url} is not ready, with exception: {err}") + print_info( + f"endpoint {url} is not ready, with exception: {err}") print_error( f"Endpoint {url} did not become ready within {timeout} seconds") def run_cmd(self, cmd_idx: int, venv) -> str: output = "" + server_proc = None server_file_path = os.path.join( - self.working_dir, f"trtllm-serve.{self.names[cmd_idx]}.log") + self.output_dir, f"trtllm-serve.{self.names[cmd_idx]}.log") client_file_path = os.path.join( - self.working_dir, f"trtllm-benchmark.{self.names[cmd_idx]}.log") + self.output_dir, 
f"trtllm-benchmark.{self.names[cmd_idx]}.log") try: - with ( # Start server process - open(server_file_path, 'w') as server_ctx, - popen(self.server_cmds[cmd_idx], - stdout=server_ctx, - stderr=subprocess.STDOUT, - env=venv._new_env, - shell=True) as server_proc): - self.wait_for_endpoint_ready( - "http://localhost:8000/v1/models", - timeout=7200) # 120 minutes for large models - output += subprocess.check_output(self.client_cmds[cmd_idx], - env=venv._new_env).decode() - # Write output to client file path - with open(client_file_path, 'w') as client_ctx: - client_ctx.write(output) + server_envs = copy.deepcopy(os.environ) + # server_envs.update(self.server_envs[cmd_idx]) + print_info( + f"Starting server. cmd is {self.server_cmds[cmd_idx]} envs are {server_envs}" + ) + with open(server_file_path, 'w') as server_ctx: + server_proc = subprocess.Popen( + self.server_cmds[cmd_idx], + stdout=server_ctx, + stderr=subprocess.STDOUT, + env=server_envs, + ) + self.wait_for_endpoint_ready("http://localhost:8000/health", + timeout=self.timeout) + client_envs = copy.deepcopy(os.environ) + # client_envs.update(self.client_envs[cmd_idx]) + print_info( + f"Starting client. cmd is {self.client_cmds[cmd_idx]} envs are {client_envs}" + ) + output = subprocess.check_output( + self.client_cmds[cmd_idx], + env=client_envs, + stderr=subprocess.STDOUT, + ).decode() + + with open(client_file_path, 'w') as client_ctx: + client_ctx.write(output) finally: server_proc.terminate() server_proc.wait() return output def get_cmd_str(self, cmd_idx) -> List[str]: - return ["server-benchmark tests, please check config files"] + return ["aggr_server tests, please check config files"] class PerfDisaggScriptTestCmds(NamedTuple): @@ -347,6 +374,259 @@ class PerfDisaggScriptTestCmds(NamedTuple): return ["disaggregated server tests, please check config files"] +class PerfMultiNodeDisaggScriptTestCmds(NamedTuple): + ctx_server_cmds: List[List[str]] + ctx_server_envs: List[Dict[str, str]] + gen_server_cmds: List[List[str]] + gen_server_envs: List[Dict[str, str]] + disagg_server_cmds: List[List[str]] + disagg_server_envs: List[Dict[str, str]] + benchmark_cmds: List[List[str]] + benchmark_envs: List[Dict[str, str]] + timeout: int + hostname: str + disagg_serving_type: str + num_ctx_servers: int + num_gen_servers: int + output_dir: str + + def _generate_disagg_server_config(self, + cmd_idx: int, + ctx_gen_port: int = 8336, + disagg_server_port: int = 8333) -> str: + print_info( + f"Generating disagg server config for command index {cmd_idx}") + # Wait for all hostname files to be created + hostnames_folder = os.path.join(self.output_dir, "hostnames") + print_info(f"Waiting for hostnames folder: {hostnames_folder}") + + expected_count = self.num_ctx_servers + self.num_gen_servers + start_time = time.time() + hostnames = [] + while True: + elapsed_time = time.time() - start_time + print_info( + f"Waiting for hostnames in {hostnames_folder}, elapsed time: {elapsed_time}s, current: {len(hostnames)}, expected: {expected_count}" + ) + if elapsed_time > self.timeout: + print_error( + f"Time out. 
Hostnames files are not ready after {self.timeout}s" + ) + time.sleep(10) + if not os.path.exists(hostnames_folder): + continue + hostnames = os.listdir(hostnames_folder) + if len(hostnames) >= expected_count: + break + print_info( + f"All hostnames found in {hostnames_folder} after elapsed time: {elapsed_time}s" + ) + + # Read ctx and gen hostnames + ctx_hostnames = [] + gen_hostnames = [] + for hostname_file in hostnames: + hostname_file_path = os.path.join(hostnames_folder, hostname_file) + with open(hostname_file_path, 'r') as f: + actual_hostname = f.read().strip() + print_info(f"Hostname: {actual_hostname} in {hostname_file}") + if hostname_file.startswith("CTX"): + ctx_hostnames.append(actual_hostname) + elif hostname_file.startswith("GEN"): + gen_hostnames.append(actual_hostname) + print_info(f"ctx_hostnames: {ctx_hostnames}") + print_info(f"gen_hostnames: {gen_hostnames}") + + # Generate server config + server_config = { + 'hostname': self.hostname, + 'port': disagg_server_port, + 'backend': 'pytorch', + 'context_servers': { + 'num_instances': self.num_ctx_servers, + 'urls': [f'{host}:{ctx_gen_port}' for host in ctx_hostnames] + }, + 'generation_servers': { + 'num_instances': self.num_gen_servers, + 'urls': [f'{host}:{ctx_gen_port}' for host in gen_hostnames] + } + } + + config_path = os.path.join(self.output_dir, "server_config.yaml") + with open(config_path, 'w') as f: + yaml.dump(server_config, f) + print_info(f"Server config file {config_path} generated") + + return config_path + + def _get_disagg_server_hostname_and_port(self) -> tuple: + config_path = os.path.join(self.output_dir, "server_config.yaml") + print_info(f"Waiting for server config file: {config_path}") + start_time = time.time() + while True: + if os.path.exists(config_path): + print_info(f"Server config file found: {config_path}") + break + elapsed_time = time.time() - start_time + if elapsed_time > self.timeout: + print_error( + f"Server config file {config_path} not found after {self.timeout}s" + ) + print_info( + f"Waiting for server config file, elapsed time: {elapsed_time}s" + ) + time.sleep(10) # Check every 10 seconds + + # Read server config to get hostname and port + with open(config_path, 'r') as f: + server_config = yaml.safe_load(f) + disagg_server_hostname = server_config['hostname'] + disagg_server_port = str(server_config['port']) + return disagg_server_hostname, disagg_server_port + + def wait_for_benchmark_ready(self, + benchmark_status_file: str, + timeout: int = 7200): + print_info( + f"Server {self.disagg_serving_type} waiting for benchmark status file: {benchmark_status_file}" + ) + start_time = time.time() + while True: + if os.path.exists(benchmark_status_file): + print_info( + f"Benchmark status file found, terminating server {self.disagg_serving_type}" + ) + break + elapsed_time = time.time() - start_time + print_info( + f"Waiting for benchmark status file, elapsed time: {elapsed_time}s" + ) + if elapsed_time > timeout: + print_error( + f"Timeout waiting for benchmark status file after {timeout}s, terminating server {self.disagg_serving_type}" + ) + break + time.sleep(10) # Check every 10 seconds + + def wait_for_endpoint_ready(self, url: str, timeout: int = 7200): + start = time.monotonic() + while True: + elapsed_time = time.monotonic() - start + if elapsed_time > timeout: + print_error( + f"Timeout waiting for endpoint {url} to be ready after {timeout} seconds" + ) + break + print_info( + f"Waiting for endpoint {url} to be ready, elapsed time: {elapsed_time}s" + ) + try: + 
time.sleep(10) + if requests.get(url).status_code == 200: + print_info(f"endpoint {url} is ready") + return + except Exception as err: + print_info( + f"endpoint {url} is not ready, with exception: {err}") + print_error( + f"Endpoint {url} did not become ready within {timeout} seconds") + + def run_cmd(self, cmd_idx: int, venv) -> str: + output = "" + server_proc = None + benchmark_status_file = os.path.join(self.output_dir, + f"benchmark_status.{cmd_idx}.txt") + if "CTX" in self.disagg_serving_type or "GEN" in self.disagg_serving_type: + server_file_path = os.path.join( + self.output_dir, + f"trtllm-serve.{cmd_idx}.{self.disagg_serving_type}.log") + is_ctx = "CTX" in self.disagg_serving_type + server_cmd = self.ctx_server_cmds[ + cmd_idx] if is_ctx else self.gen_server_cmds[cmd_idx] + server_envs = copy.deepcopy(os.environ) + # server_envs.update(self.ctx_server_envs[cmd_idx] + # if is_ctx else self.gen_server_envs[cmd_idx]) + try: + print_info( + f"Starting server. disagg_serving_type: {self.disagg_serving_type} cmd is {server_cmd} envs are {server_envs}" + ) + with open(server_file_path, 'w') as server_ctx: + server_proc = subprocess.Popen( + server_cmd, + stdout=server_ctx, + stderr=subprocess.STDOUT, + env=server_envs, + ) + self.wait_for_benchmark_ready(benchmark_status_file, + timeout=self.timeout) + finally: + print_info(f"Server {self.disagg_serving_type} stopped") + server_proc.terminate() + server_proc.wait() + elif self.disagg_serving_type == "DISAGG_SERVER": + disagg_server_file_path = os.path.join( + self.output_dir, + f"trtllm-serve.{cmd_idx}.{self.disagg_serving_type}.log") + disagg_server_cmd = self.disagg_server_cmds[cmd_idx] + disagg_server_envs = copy.deepcopy(os.environ) + # disagg_server_envs.update(self.disagg_server_envs[cmd_idx]) + try: + # Generate disagg server config (this will wait for all hostnames) + self._generate_disagg_server_config(cmd_idx) + print_info( + f"Starting disagg server. disagg_serving_type: {self.disagg_serving_type} disagg server cmd is {disagg_server_cmd} envs are {disagg_server_envs}" + ) + with open(disagg_server_file_path, 'w') as disagg_server_ctx: + disagg_server_proc = subprocess.Popen( + disagg_server_cmd, + stdout=disagg_server_ctx, + stderr=subprocess.STDOUT, + env=disagg_server_envs, + ) + self.wait_for_benchmark_ready(benchmark_status_file, + timeout=self.timeout) + finally: + print_info(f"Disagg server {self.disagg_serving_type} stopped") + disagg_server_proc.terminate() + disagg_server_proc.wait() + elif self.disagg_serving_type == "BENCHMARK": + benchmark_file_path = os.path.join( + self.output_dir, f"trtllm-benchmark.{cmd_idx}.log") + try: + # Get disagg server's hostname and port + disagg_server_hostname, disagg_server_port = self._get_disagg_server_hostname_and_port( + ) + # Add hostname and port to benchmark command + benchmark_cmd = self.benchmark_cmds[cmd_idx] + [ + '--host', disagg_server_hostname, '--port', + disagg_server_port + ] + benchmark_envs = copy.deepcopy(os.environ) + # benchmark_envs.update(self.benchmark_envs[cmd_idx]) + self.wait_for_endpoint_ready( + f"http://{disagg_server_hostname}:{disagg_server_port}/health", + timeout=self.timeout, + ) + # Run benchmark + print_info( + f"Starting benchmark. 
disagg_serving_type: {self.disagg_serving_type} benchmark cmd is {benchmark_cmd} envs are {benchmark_envs}" + ) + output = subprocess.check_output( + benchmark_cmd, env=benchmark_envs, + stderr=subprocess.STDOUT).decode() + with open(benchmark_file_path, 'w') as benchmark_ctx: + benchmark_ctx.write(output) + finally: + with open(benchmark_status_file, 'w') as status_file: + status_file.write("Done") + return output + + def get_cmd_str(self, cmd_idx) -> List[str]: + return [ + "multi-node disaggregated server tests, please check config files" + ] + + class AbstractPerfScriptTestClass(abc.ABC): """ Abstract class for all script-based perf tests. @@ -453,6 +733,14 @@ class AbstractPerfScriptTestClass(abc.ABC): is_prepare_dataset_cmd = 'prepare_dataset' in commands.get_cmd_str( cmd_idx) + is_perf_sanity_test = "perf_sanity" in full_test_name + + is_disagg_server = False + if self._config.runtime == "multi_node_disagg_server": + disagg_serving_type = self._config.disagg_configs[0][ + 'disagg_serving_type'] + is_disagg_server = disagg_serving_type != "BENCHMARK" + # Start the timer. self._start_timestamp = datetime.utcnow() try: @@ -460,7 +748,8 @@ class AbstractPerfScriptTestClass(abc.ABC): # Capture the stdout from _gpu_clock_lock because the pipeline JUnit update script tries to parse # the log to find the GPU clocks. with io.StringIO() as buf: - if self._gpu_clock_lock: + # Perf-sanity test doesn't lock gpu clock + if self._gpu_clock_lock and not is_perf_sanity_test: # Lock GPU clock and start monitoring. with contextlib.redirect_stdout( buf), self._gpu_clock_lock, tmpDir: @@ -515,9 +804,12 @@ class AbstractPerfScriptTestClass(abc.ABC): # Parse the perf result from the test outputs. if is_prepare_dataset_cmd: print_info( - f"skip writing perf result when calling generating dataset in trtllm-bench" + f"skip writing perf result when calling generating dataset in trtllm-bench." 
) outputs.pop(cmd_idx) + elif is_disagg_server: + print_info( + f"skip writing perf result when running disagg's server.") else: self._perf_result = self.get_perf_result(outputs) diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml new file mode 100644 index 0000000000..d4470fe1a4 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml @@ -0,0 +1,41 @@ +version: 0.0.1 +l0_dgx_b200_perf_sanity: +- condition: + ranges: + system_gpu_count: + gte: 8 + lte: 8 + wildcards: + gpu: + - '*b200*' + linux_distribution_name: ubuntu* + cpu: x86_64 + terms: + stage: post_merge + backend: pytorch + orchestrator: mpi + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180) + +- condition: + ranges: + system_gpu_count: + gte: 4 + lte: 4 + wildcards: + gpu: + - '*b200*' + linux_distribution_name: ubuntu* + cpu: x86_64 + terms: + stage: post_merge + backend: pytorch + orchestrator: mpi + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-gpt_oss_fp4_dep2,gpt_oss_fp4_dep4] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-gpt_oss_fp4_tp4_eagle3] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml new file mode 100644 index 0000000000..ff0b9eafe3 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml @@ -0,0 +1,41 @@ +version: 0.0.1 +l0_dgx_b300_perf_sanity: +- condition: + ranges: + system_gpu_count: + gte: 8 + lte: 8 + wildcards: + gpu: + - '*gb110*' + - '*b300*' + linux_distribution_name: ubuntu* + cpu: x86_64 + terms: + stage: post_merge + backend: pytorch + orchestrator: mpi + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180) + +- condition: + ranges: + system_gpu_count: + gte: 4 + lte: 4 + wildcards: + gpu: + - '*gb110*' + - '*b300*' + linux_distribution_name: ubuntu* + cpu: x86_64 + terms: + stage: post_merge + backend: pytorch + orchestrator: mpi + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml new file mode 100644 index 0000000000..23f4b20f97 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml @@ -0,0 +1,22 @@ +version: 0.0.1 
+l0_gb200_multi_gpus_perf_sanity: +- condition: + ranges: + system_gpu_count: + gte: 4 + lte: 4 + wildcards: + gpu: + - '*gb200*' + linux_distribution_name: ubuntu* + cpu: aarch64 + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_dep4_mtp1_1k1k] + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tep4_mtp3_1k1k] + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tp4_mtp3_1k1k] + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_dep4_mtp1_8k1k] + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tep4_mtp3_8k1k] + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tp4_mtp3_8k1k] diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity.yml new file mode 100644 index 0000000000..bc7d95b047 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity.yml @@ -0,0 +1,16 @@ +version: 0.0.1 +l0_gb200_multi_nodes_perf_sanity: +- condition: + ranges: + # 2 nodes with each node has 4 GPUs + system_gpu_count: + gte: 8 + lte: 8 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_nodes-r1_fp4_v2_dep8_mtp1] diff --git a/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b200.yml b/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b200.yml deleted file mode 100644 index 3fdd60670f..0000000000 --- a/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b200.yml +++ /dev/null @@ -1,35 +0,0 @@ -version: 0.0.1 -perf_sanity_l0_dgx_b200: -- condition: - ranges: - system_gpu_count: - gte: 4 - lte: 4 - wildcards: - gpu: - - '*b200*' - linux_distribution_name: ubuntu* - cpu: x86_64 - terms: - stage: pre_merge - backend: pytorch - orchestrator: mpi - tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200] - -- condition: - ranges: - system_gpu_count: - gte: 4 - lte: 4 - wildcards: - gpu: - - '*b200*' - linux_distribution_name: ubuntu* - cpu: x86_64 - terms: - stage: post_merge - backend: pytorch - orchestrator: mpi - tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200] diff --git a/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b300.yml b/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b300.yml deleted file mode 100644 index ef98b37ef9..0000000000 --- a/tests/integration/test_lists/test-db/perf_sanity_l0_dgx_b300.yml +++ /dev/null @@ -1,37 +0,0 @@ -version: 0.0.1 -perf_sanity_l0_dgx_b300: -- condition: - ranges: - system_gpu_count: - gte: 4 - lte: 4 - wildcards: - gpu: - - '*gb110*' - - '*b300*' - linux_distribution_name: ubuntu* - cpu: x86_64 - terms: - stage: pre_merge - backend: pytorch - orchestrator: mpi - tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b300] - -- condition: - ranges: - system_gpu_count: - gte: 4 - lte: 4 - wildcards: - gpu: - - '*gb110*' - - '*b300*' - linux_distribution_name: ubuntu* - cpu: x86_64 - terms: - stage: post_merge - backend: pytorch - orchestrator: mpi - tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b300] diff --git a/tests/scripts/perf-sanity/README.md b/tests/scripts/perf-sanity/README.md index ee928939c6..66f9a93fc6 100644 --- a/tests/scripts/perf-sanity/README.md +++ b/tests/scripts/perf-sanity/README.md @@ 
-1,134 +1,109 @@ -# TensorRT-LLM Benchmark Test System +# TensorRT-LLM Perf Sanity Test System -Benchmarking scripts for TensorRT-LLM serving performance tests with configuration-driven test cases and CSV report generation. +Performance sanity testing scripts for TensorRT-LLM with configuration-driven test cases supporting single-node, multi-node aggregated, and multi-node disaggregated architectures. ## Overview -- Run performance benchmarks across multiple model configurations +- Run performance sanity benchmarks across multiple model configurations +- Support three deployment architectures: single-node, multi-node aggregated, and multi-node disaggregated - Manage test cases through YAML configuration files -- Support selective execution of specific test cases +- Automated resource calculation and job submission via SLURM -## Scripts Overview +## Configuration File Types -### 1. `benchmark_config.yaml` - Test Case Configuration -**Purpose**: Defines all benchmark test cases in a structured YAML format. +There are three types of YAML configuration files for different deployment architectures: + +### 1. Single-Node Aggregated Test Configuration + +**File Example**: `l0_dgx_b200.yaml` + +**Use Case**: Single-node performance tests on a single server with multiple GPUs. **Structure**: ```yaml server_configs: - - name: "r1_fp4_dep4" - model_name: "deepseek_r1_0528_fp4" - tp: 4 - ep: 4 - pp: 1 + - name: "r1_fp8_dep8_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp8" + gpus: 8 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 attention_backend: "TRTLLM" - moe_backend: "CUTLASS" - moe_max_num_tokens: "" enable_attention_dp: true - enable_chunked_prefill: false - max_num_tokens: 2176 - disable_overlap_scheduler: false - kv_cache_dtype: "fp8" - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - max_batch_size: 256 - enable_padding: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 client_configs: - - name: "con1_iter1_1024_1024" - concurrency: 1 - iterations: 1 + - name: "con4096_iter10_1k1k" + concurrency: 4096 + iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.0 - - name: "con8_iter1_1024_1024" - concurrency: 8 - iterations: 1 - isl: 1024 - osl: 1024 - random_range_ratio: 0.0 + random_range_ratio: 0.8 + backend: "openai" +``` - - name: "r1_fp4_tep4" - model_name: "deepseek_r1_0528_fp4" - tp: 4 - ep: 4 - pp: 1 - attention_backend: "TRTLLM" - moe_backend: "CUTLASS" - moe_max_num_tokens: "" - enable_attention_dp: false - enable_chunked_prefill: false - max_num_tokens: 2176 - disable_overlap_scheduler: false - kv_cache_dtype: "fp8" - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - max_batch_size: 256 - enable_padding: true + +### 2. Multi-Node Aggregated Test Configuration + +**File Example**: `l0_gb200_multi_nodes.yaml` + +**Use Case**: Multi-node aggregated architecture where model runs across multiple nodes with unified execution. 
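+
+Beyond the `server_configs` and `client_configs` blocks shown under **Structure** below, a config file may also carry an optional top-level `environment` section; its `server_env_var` and `client_env_var` strings are attached to the generated `ServerConfig` and `ClientConfig` objects (the client values are also stored with uploaded results as `s_client_env_vars`). A minimal sketch, using placeholder variable names rather than real defaults:
+
+```yaml
+environment:
+  server_env_var: "EXAMPLE_SERVER_FLAG=1"   # placeholder; attached to each ServerConfig
+  client_env_var: "EXAMPLE_CLIENT_FLAG=1"   # placeholder; attached to each ClientConfig
+```
+
+A server entry whose `gpus` value differs from `gpus_per_node` (for example, `gpus: 8` with `gpus_per_node: 4` below) is tagged as `multi_node_aggr_server` rather than `aggr_server` when its results are uploaded.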
+ +**Structure**: +```yaml +# Hardware Config +hardware: + gpus_per_node: 4 + gpus_per_server: 8 + +server_configs: + - name: "r1_fp4_v2_dep8_mtp1" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 8 + gpus_per_node: 4 + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 2112 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 client_configs: - - name: "con1_iter1_1024_1024" - concurrency: 1 - iterations: 1 + - name: "con32_iter12_1k1k" + concurrency: 32 + iterations: 12 isl: 1024 osl: 1024 - random_range_ratio: 0.0 - - name: "con8_iter1_1024_1024" - concurrency: 8 - iterations: 1 - isl: 1024 - osl: 1024 - random_range_ratio: 0.0 -``` - -### 2. `run_benchmark_serve.py` - Main Benchmark Runner -**Purpose**: Executes performance benchmarks based on YAML configuration files. - -**Usage**: -```bash -python run_benchmark_serve.py --log_folder --config_file [--select ] [--timeout 5400] -``` - -**Arguments**: -- `--log_folder`: Directory to store benchmark logs (required) -- `--config_file`: Path to YAML configuration file (required) -- `--select`: Select pattern for specific Server and Client Config. (optional, default: all test cases) -- `--timeout`: Timeout for server setup. (optional, default: 3600 seconds) - -**Examples**: -```bash -# Select -python run_benchmark_serve.py --log_folder ./results --config_file benchmark_config.yaml --select "r1_fp4_dep4:con8_iter1_1024_1024,r1_fp4_tep4:con1_iter1_1024_1024" - -``` - -### 3. `parse_benchmark_results.py` - Results Parser -**Purpose**: Print log's perf. - -**Arguments**: -- `--log_folder`: Directory to store benchmark logs (required) - -**Usage**: -```bash -python parse_benchmark_results.py --log_folder -``` - - -### 4. `benchmark-serve.sh` - SLURM Job Script -**Usage**: -```bash -sbatch benchmark-serve.sh [IMAGE] [bench_dir] [log_folder] [select_pattern] -``` - -**Parameters**: -- `IMAGE`: Docker image (default: tensorrt-llm-staging/release:main-x86_64) -- `bench_dir`: Directory containing config file and benchmark scripts (default: current directory) -- `log_folder`: Directory containing output logs and csv. 
(default: current directory) -- `select_pattern`: Select pattern (default: default - all test cases) - -**Examples**: -```bash - -bench_dir="/path/to/benchmark/scripts" -log_folder="/path/to/store/output/files" -sbatch --reservation=RES--COM-3970 --qos=reservation -D ${log_folder} ${bench_dir}/benchmark-serve.sh urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release:main-x86_64 ${bench_dir} ${log_folder} "r1_fp4_dep4:con8_iter1_1024_1024,r1_fp4_tep4:con1_iter1_1024_1024" - + random_range_ratio: 0.8 + backend: "openai" ``` diff --git a/tests/scripts/perf-sanity/l0_dgx_b200.yaml b/tests/scripts/perf-sanity/l0_dgx_b200.yaml index d8fccb78ef..17679d4ac8 100644 --- a/tests/scripts/perf-sanity/l0_dgx_b200.yaml +++ b/tests/scripts/perf-sanity/l0_dgx_b200.yaml @@ -1,58 +1,293 @@ server_configs: - - name: "r1_fp4_dep4" - model_name: "deepseek_r1_0528_fp4" - gpus: 4 - tp: 4 - ep: 4 - pp: 1 - attention_backend: "TRTLLM" - moe_backend: "CUTLASS" + - name: "r1_fp8_dep8_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp8" + gpus: 8 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" enable_attention_dp: true - enable_chunked_prefill: false - max_num_tokens: 2176 - kv_cache_dtype: "fp8" - free_gpu_memory_fraction: 0.8 - max_batch_size: 256 - enable_padding: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 client_configs: - - name: "con1_iter1_1024_1024" - concurrency: 1 - iterations: 1 + - name: "con4096_iter10_1k1k" + concurrency: 4096 + iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.0 - - name: "con8_iter1_1024_1024" - concurrency: 8 - iterations: 1 - isl: 1024 - osl: 1024 - random_range_ratio: 0.0 + random_range_ratio: 0.8 + backend: "openai" - - name: "r1_fp4_tep4" - model_name: "deepseek_r1_0528_fp4" - gpus: 4 - tp: 4 - ep: 4 - pp: 1 - attention_backend: "TRTLLM" - moe_backend: "CUTLASS" + - name: "r1_fp8_tep8_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp8" + gpus: 8 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 8192 + attn_backend: "TRTLLM" enable_attention_dp: false - enable_chunked_prefill: false - max_num_tokens: 2176 - kv_cache_dtype: "fp8" - free_gpu_memory_fraction: 0.8 - max_batch_size: 256 - enable_padding: true + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 client_configs: - - name: "con1_iter1_1024_1024" - concurrency: 1 - iterations: 1 + - name: "con64_iter10_1k1k" + concurrency: 64 + iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.0 - - name: "con8_iter1_1024_1024" + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp8_tp8_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp8" + gpus: 8 + tensor_parallel_size: 8 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + 
max_batch_size: 8 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con8_iter10_1k1k" concurrency: 8 - iterations: 1 + iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.0 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_dep4_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 + client_configs: + - name: "con2048_iter10_1k1k" + concurrency: 2048 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tep4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con32_iter10_1k1k" + concurrency: 32 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tp4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con4_iter10_1k1k" + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "gpt_oss_fp4_dep2_1k1k" + model_name: "gpt_oss_120b_fp4" + gpus: 2 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + enable_balance: true + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 1024 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + num_postprocess_workers: 4 + stream_interval: 20 + client_configs: + - name: "con2048_iter5_1k1k" + concurrency: 2048 + iterations: 5 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "gpt_oss_fp4_dep4_1k1k" + model_name: "gpt_oss_120b_fp4" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + enable_balance: 
true + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + num_postprocess_workers: 4 + stream_interval: 20 + client_configs: + - name: "con2048_iter5_1k1k" + concurrency: 2048 + iterations: 5 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "gpt_oss_fp4_tp4_eagle3_1k1k" + model_name: "gpt_oss_120b_fp4" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'Eagle' + eagle3_layers_to_capture: [-1] + max_draft_len: 3 + speculative_model_dir: "gpt_oss/gpt-oss-120b-Eagle3" + stream_interval: 20 + num_postprocess_workers: 4 + client_configs: + - name: "con1_iter32_1k1k" + concurrency: 1 + iterations: 32 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" diff --git a/tests/scripts/perf-sanity/l0_dgx_b300.yaml b/tests/scripts/perf-sanity/l0_dgx_b300.yaml index d8fccb78ef..b19ca77812 100644 --- a/tests/scripts/perf-sanity/l0_dgx_b300.yaml +++ b/tests/scripts/perf-sanity/l0_dgx_b300.yaml @@ -1,58 +1,194 @@ server_configs: - - name: "r1_fp4_dep4" - model_name: "deepseek_r1_0528_fp4" - gpus: 4 - tp: 4 - ep: 4 - pp: 1 - attention_backend: "TRTLLM" - moe_backend: "CUTLASS" + - name: "r1_fp8_dep8_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp8" + gpus: 8 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" enable_attention_dp: true - enable_chunked_prefill: false - max_num_tokens: 2176 - kv_cache_dtype: "fp8" - free_gpu_memory_fraction: 0.8 - max_batch_size: 256 - enable_padding: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 client_configs: - - name: "con1_iter1_1024_1024" - concurrency: 1 - iterations: 1 + - name: "con4096_iter10_1k1k" + concurrency: 4096 + iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.0 - - name: "con8_iter1_1024_1024" - concurrency: 8 - iterations: 1 - isl: 1024 - osl: 1024 - random_range_ratio: 0.0 + random_range_ratio: 0.8 + backend: "openai" - - name: "r1_fp4_tep4" - model_name: "deepseek_r1_0528_fp4" - gpus: 4 - tp: 4 - ep: 4 - pp: 1 - attention_backend: "TRTLLM" - moe_backend: "CUTLASS" + - name: "r1_fp8_tep8_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp8" + gpus: 8 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 8192 + attn_backend: "TRTLLM" enable_attention_dp: false - enable_chunked_prefill: false - max_num_tokens: 2176 - kv_cache_dtype: "fp8" - free_gpu_memory_fraction: 0.8 - max_batch_size: 256 - enable_padding: true + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + 
speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 client_configs: - - name: "con1_iter1_1024_1024" - concurrency: 1 - iterations: 1 + - name: "con64_iter10_1k1k" + concurrency: 64 + iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.0 - - name: "con8_iter1_1024_1024" + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp8_tp8_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp8" + gpus: 8 + tensor_parallel_size: 8 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con8_iter10_1k1k" concurrency: 8 - iterations: 1 + iterations: 10 isl: 1024 osl: 1024 - random_range_ratio: 0.0 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_dep4_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 + client_configs: + - name: "con2048_iter10_1k1k" + concurrency: 2048 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tep4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con32_iter10_1k1k" + concurrency: 32 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tp4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con4_iter10_1k1k" + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" diff --git a/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml b/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml new file mode 100644 index 0000000000..8e8efc1bc3 --- /dev/null +++ b/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml @@ -0,0 +1,294 @@ +server_configs: + # 1k1k configs + - name: 
"r1_fp4_v2_dep4_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 + client_configs: + - name: "con2048_iter10_1k1k" + concurrency: 2048 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tep4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con32_iter10_1k1k" + concurrency: 32 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tp4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con4_iter10_1k1k" + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + # 8k1k configs + - name: "r1_fp4_v2_dep4_mtp1_8k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 10304 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 + client_configs: + - name: "con2048_iter10_8k1k" + concurrency: 2048 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tep4_mtp3_8k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 10304 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con32_iter10_8k1k" + 
concurrency: 32 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tp4_mtp3_8k1k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 10304 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con4_iter10_8k1k" + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + # 1k8k configs + - name: "r1_fp4_v2_dep4_mtp1_1k8k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 + client_configs: + - name: "con2048_iter10_1k8k" + concurrency: 2048 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tep4_mtp3_1k8k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con32_iter10_1k8k" + concurrency: 32 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tp4_mtp3_1k8k" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 4 + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con4_iter10_1k8k" + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.8 + backend: "openai" diff --git a/tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml b/tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml new file mode 100644 index 0000000000..3dcdc83684 --- /dev/null +++ b/tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml @@ -0,0 +1,71 @@ +# Hardware Config +hardware: + gpus_per_node: 4 + gpus_per_server: 8 + +server_configs: + - name: "r1_fp4_v2_dep8_mtp1" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 8 + gpus_per_node: 4 + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 
512 + max_num_tokens: 2112 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + client_configs: + - name: "con32_iter12_1k1k" + concurrency: 32 + iterations: 12 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" + + - name: "r1_fp4_v2_tep8_mtp3" + model_name: "deepseek_r1_0528_fp4_v2" + gpus: 8 + gpus_per_node: 4 + trust_remote_code: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 3136 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: "TRTLLM" + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + free_gpu_memory_fraction: 0.5 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con32_iter12_1k1k" + concurrency: 32 + iterations: 12 + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + backend: "openai" From 4da0e1473cf244c7035a7bfc8155ad8bb0bc0238 Mon Sep 17 00:00:00 2001 From: Zheng Duan <200704041+zhengd-nv@users.noreply.github.com> Date: Mon, 8 Dec 2025 09:51:10 +0800 Subject: [PATCH 06/10] [None][test] add ntp tolerance in time metrics verification (#9741) Signed-off-by: zhengd-nv <200704041+zhengd-nv@users.noreply.github.com> --- .../integration/defs/disaggregated/test_disaggregated.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index b2db88f0d2..bb811de4d1 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -114,13 +114,16 @@ def validate_timing_metrics(perf_metrics_item, request_context=""): )), f"gen server_first_token_time is not numeric in {request_context}" assert gen_server_arrival <= gen_server_first_token, f"gen server_arrival_time > server_first_token_time in {request_context}" + # Network Time Protocol can ensure ms-level accuracy in LAN + ntp_tolerance = 1e-3 + # Validate timing relationships between different levels # Disaggregated server should receive request before individual servers - assert disagg_arrival <= ctx_server_arrival, f"disagg_arrival > ctx_server_arrival in {request_context}" - assert disagg_arrival <= gen_server_arrival, f"disagg_arrival > gen_server_arrival in {request_context}" + assert disagg_arrival - ntp_tolerance <= ctx_server_arrival, f"disagg_arrival > ctx_server_arrival in {request_context}" + assert disagg_arrival - ntp_tolerance <= gen_server_arrival, f"disagg_arrival > gen_server_arrival in {request_context}" # Context should complete before generation starts - assert ctx_server_first_token <= gen_server_arrival, f"ctx_server_first_token > gen_server_arrival in {request_context}" + assert ctx_server_first_token - ntp_tolerance <= gen_server_arrival, f"ctx_server_first_token > gen_server_arrival in {request_context}" # Validate internal timing consistency ctx_arrival_time = ctx_metrics["arrival_time"] From 8e27ce7084d9fab1051e88fc945732e59689761b Mon Sep 17 00:00:00 2001 From: xxi <95731198+xxi-nv@users.noreply.github.com> Date: Mon, 8 Dec 2025 10:19:40 +0800 Subject: [PATCH 07/10] [TRTLLM-9603][feat] 
Enable ConfigurableMoE test in the CI (#9645) --- .../modules/fused_moe/configurable_moe.py | 29 +++-- .../defs/accuracy/test_llm_api_pytorch.py | 94 ++++++++++++-- tests/integration/defs/conftest.py | 92 ++++++++++++++ .../test_lists/test-db/l0_dgx_b200.yml | 10 ++ tests/unittest/_torch/modules/conftest.py | 118 ++++++++++++++++++ .../unittest/_torch/modules/test_fused_moe.py | 62 ++++++++- 6 files changed, 382 insertions(+), 23 deletions(-) create mode 100644 tests/unittest/_torch/modules/conftest.py diff --git a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py index 717d8f78fe..c7df8e1f9a 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py +++ b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py @@ -170,18 +170,23 @@ class ConfigurableMoE(MoE): # ConfigurableMoE's super().__init__() was called with real layer_idx and initialized load balancer. # Backend was created with init_load_balancer=False and without_comm=True to avoid # duplicate initialization. Now sync all attributes from ConfigurableMoE to backend. - self.backend.aux_stream_dict = self.aux_stream_dict - self.backend.layer_idx = self.layer_idx - self.backend.layer_idx_str = self.layer_idx_str - self.backend.num_slots = self.num_slots - self.backend.layer_load_balancer = self.layer_load_balancer - self.backend.repeat_count = self.repeat_count - self.backend.repeat_idx = self.repeat_idx - self.backend.initial_local_expert_ids = self.initial_local_expert_ids - self.backend.initial_global_assignments = self.initial_global_assignments - self.backend.slot_start = self.slot_start - self.backend.slot_end = self.slot_end - self.backend.expert_size_per_partition = self.expert_size_per_partition + if self.backend is not None: + # Add a check to WAR the issue that the backend is none during torch.compile + assert not torch.compiler.is_compiling(), ( + "Backend should not be none if not in torch.compile" + ) + self.backend.aux_stream_dict = self.aux_stream_dict + self.backend.layer_idx = self.layer_idx + self.backend.layer_idx_str = self.layer_idx_str + self.backend.num_slots = self.num_slots + self.backend.layer_load_balancer = self.layer_load_balancer + self.backend.repeat_count = self.repeat_count + self.backend.repeat_idx = self.repeat_idx + self.backend.initial_local_expert_ids = self.initial_local_expert_ids + self.backend.initial_global_assignments = self.initial_global_assignments + self.backend.slot_start = self.slot_start + self.backend.slot_end = self.slot_end + self.backend.expert_size_per_partition = self.expert_size_per_partition # Create weights here, because the backend needs the layer_load_balancer info to create weights model_config._frozen = False diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index f5396fc8a6..09b1613f75 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -13,9 +13,37 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +import sys import pytest import torch +from mpi4py.futures import MPIPoolExecutor + + +def patch_mpi_pool_session_for_env(mocker, env_vars: dict): + """ + Patch MpiPoolSession._start_mpi_pool to propagate environment variables to MPI child processes. 
+ + Uses MPIPoolExecutor's built-in `env` parameter instead of `initializer` to avoid + segfault issues during process cleanup (UCX memory cache conflicts with PyTorch + tensor cleanup during Py_FinalizeEx). + + Args: + mocker: pytest-mock mocker fixture + env_vars: Dictionary of environment variable name -> value to propagate + """ + from tensorrt_llm.llmapi.mpi_session import MpiPoolSession + + def patched_start_mpi_pool(self): + assert not self.mpi_pool, 'MPI session already started' + self.mpi_pool = MPIPoolExecutor(max_workers=self.n_workers, + path=sys.path, + env=env_vars) + + mocker.patch.object(MpiPoolSession, '_start_mpi_pool', + patched_start_mpi_pool) + + from defs.conftest import get_sm_version, is_sm_100f from tensorrt_llm import LLM @@ -1830,9 +1858,24 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): ids=["tp4", "ep4", "tp2pp2", "pp4"]) @parametrize_with_ids("mtp_nextn", [0, 2]) @parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM", "CUTEDSL"]) + @pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler, tp_size, pp_size, ep_size, - torch_compile, mtp_nextn, moe_backend): + torch_compile, mtp_nextn, moe_backend, + enable_configurable_moe, mocker): + # Handle ENABLE_CONFIGURABLE_MOE environment variable + if enable_configurable_moe == 1 and moe_backend != "TRTLLM": + pytest.skip( + f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM backend, " + f"current backend is {moe_backend}") + + # Patch MpiPoolSession to propagate env vars to MPI worker processes + env_value = "1" if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + patch_mpi_pool_session_for_env(mocker, + {"ENABLE_CONFIGURABLE_MOE": env_value}) + if moe_backend == "TRTLLM" and (get_sm_version() == 120 or get_sm_version() == 121): pytest.skip( @@ -3452,9 +3495,23 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness): ids=["latency", "ep2", "ep4"]) @pytest.mark.parametrize("activation_dtype", ["static_fp8", "mxfp8"], ids=["fp8", "mxfp8"]) + @pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") def test_w4a8_mxfp4(self, moe_backend, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler, - activation_dtype): + activation_dtype, enable_configurable_moe, mocker): + # Handle ENABLE_CONFIGURABLE_MOE environment variable + if enable_configurable_moe == 1 and moe_backend != "TRTLLM": + pytest.skip( + f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM backend, " + f"current backend is {moe_backend}") + + # Patch MpiPoolSession to propagate env vars to MPI worker processes + env_value = "1" if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + patch_mpi_pool_session_for_env(mocker, + {"ENABLE_CONFIGURABLE_MOE": env_value}) + if moe_backend == "TRITON": if not IS_TRITON_KERNELS_AVAILABLE: pytest.skip("TRITON moe backend is not available.") @@ -3906,9 +3963,23 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): (4, 1, 4, True, True, True), ], ids=["tp4", "ep4", "dp4"]) + @pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") def test_w4_4gpus(self, kv_cache_dtype, moe_backend, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler, - mocker): + enable_configurable_moe, mocker): + # Handle ENABLE_CONFIGURABLE_MOE environment variable + if enable_configurable_moe == 1 and 
moe_backend != "TRTLLM": + pytest.skip( + f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM backend, " + f"current backend is {moe_backend}") + + # Patch MpiPoolSession to propagate env vars to MPI worker processes + env_value = "1" if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + patch_mpi_pool_session_for_env(mocker, + {"ENABLE_CONFIGURABLE_MOE": env_value}) + if moe_backend == "TRITON": if not IS_TRITON_KERNELS_AVAILABLE: pytest.skip("Triton kernels are not available") @@ -3925,7 +3996,8 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, - cuda_graph_config=CudaGraphConfig() if cuda_graph else None) + cuda_graph_config=CudaGraphConfig() if cuda_graph else None, + moe_config=MoeConfig(backend=moe_backend)) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7, dtype=kv_cache_dtype) @@ -3939,8 +4011,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): max_seq_len=max_seq_len, max_batch_size=720, **pytorch_config, - enable_attention_dp=attention_dp, - moe_config=MoeConfig(backend=moe_backend)) + enable_attention_dp=attention_dp) with llm: model_name = "GPT-OSS/120B-MXFP4" @@ -4252,8 +4323,17 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): @pytest.mark.parametrize( "kv_cache_dtype", ["auto", pytest.param("fp8", marks=skip_pre_blackwell)]) - def test_w4_4gpus_online_eplb(self, kv_cache_dtype, mocker): + @pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") + def test_w4_4gpus_online_eplb(self, kv_cache_dtype, enable_configurable_moe, + mocker): """Test GPTOSS with online expert parallel load balancer using TRTLLM backend and attention DP.""" + # Patch MpiPoolSession to propagate env vars to MPI worker processes + env_value = "1" if enable_configurable_moe == 1 else "0" + patch_mpi_pool_session_for_env(mocker, + {"ENABLE_CONFIGURABLE_MOE": env_value}) + mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) mocker.patch.dict(GSM8K.EVALUATE_KWARGS, {"scores_filter": "exact_match,flexible-extract"}) diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py index ee374ef086..792eca22a7 100644 --- a/tests/integration/defs/conftest.py +++ b/tests/integration/defs/conftest.py @@ -2209,6 +2209,94 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): metafunc.parametrize("case", uts, ids=lambda x: x) +# Test cases that use enable_configurable_moe parameter and need ID conversion +TESTS_WITH_CONFIGURABLE_MOE = [ + "TestDeepSeekV3Lite::test_nvfp4_4gpus", + "TestGPTOSS::test_w4_4gpus", + "TestGPTOSS::test_w4_4gpus_online_eplb", + "TestQwen3_30B_A3B::test_w4a8_mxfp4", +] + + +def _convert_clean_to_original_moe_test_id(test_id): + """Convert clean MoE test ID back to original format for pytest collection. + + Example: "test_llm_api_pytorch.py::test_foo[param]" -> "test_llm_api_pytorch.py::test_foo[-param]" + + This is needed because the `enable_configurable_moe` parameter uses empty string + as ID when value is 0, resulting in test IDs like "test_foo[-param]". + We clean these up in pytest_collection_modifyitems, but pytest filters tests + during collection using the original IDs. So when user runs with clean test name, + we need to convert it back to match the original. 
+ """ + if "test_llm_api_pytorch.py" not in test_id: + return test_id + + # Match pattern like "test_name[params]" and add leading dash after "[" + # But only if params don't already start with "-" or "enable_configurable_moe" + match = re.search(r"\[([^\]]+)\]", test_id) + if match: + params = match.group(1) + # Skip if already has leading dash or starts with enable_configurable_moe + if not params.startswith("-") and not params.startswith( + "enable_configurable_moe"): + # Add leading dash to params + new_params = "-" + params + test_id = test_id.replace(f"[{params}]", f"[{new_params}]") + + return test_id + + +def pytest_sessionstart(session): + """Convert clean MoE test IDs in config.args to original format for collection. + + This is needed because pytest filters tests during collection using original IDs. + When user runs with clean test name, we convert it back to match the original. + """ + args = session.config.args + for i, arg in enumerate(args): + if "test_llm_api_pytorch.py" in arg and "[" in arg: + # Only apply conversion to specific tests that use enable_configurable_moe + should_convert = any(test_name in arg + for test_name in TESTS_WITH_CONFIGURABLE_MOE) + if should_convert: + args[i] = _convert_clean_to_original_moe_test_id(arg) + + +def _clean_moe_test_ids(items): + """Clean up test IDs by removing leading/trailing dashes from parameter IDs. + + This is needed because `enable_configurable_moe` parameter can be empty, + resulting in ugly test IDs like "test_foo[-True]" or "test_foo[--abc]". + We clean these up to "test_foo[True]" or "test_foo[abc]" so that: + 1. Test names in waive files and test lists remain unchanged + 2. Test reports look cleaner + """ + for item in items: + if "test_llm_api_pytorch.py" in item.nodeid and "[" in item.nodeid: + # Only apply cleanup to specific tests that use enable_configurable_moe + should_cleanup = any(test_name in item.nodeid + for test_name in TESTS_WITH_CONFIGURABLE_MOE) + if should_cleanup: + original_nodeid = item.nodeid + original_name = item.name + nodeid = item.nodeid + name = item.name + + # Clean up leading/trailing dashes in nodeid + nodeid = nodeid.replace("[-", "[") + nodeid = nodeid.replace("-]", "]") + + # Clean up leading/trailing dashes in name + name = name.replace("[-", "[") + name = name.replace("-]", "]") + + if nodeid != original_nodeid: + item._nodeid = nodeid + if name != original_name: + item.name = name + + @pytest.hookimpl(tryfirst=True, hookwrapper=True) def pytest_collection_modifyitems(session, config, items): testlist_path = config.getoption("--test-list") @@ -2217,6 +2305,10 @@ def pytest_collection_modifyitems(session, config, items): perf_test = config.getoption("--perf") test_model_suites = config.getoption("--test-model-suites") + # TODO Once the MoE refactor is complete, this should be removed. + # This is a temporary WAR to minimize the impact of the MoE refactor on the existing test lists. 
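+    # Cleaning the IDs here keeps them consistent for the waive/test-list matching
+    # done later in this hook, so entries in waives.txt and the test lists can keep
+    # using names like "test_foo[tp4-...]" rather than "test_foo[-tp4-...]".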
+ _clean_moe_test_ids(items) + if perf_test: global ALL_PYTEST_ITEMS ALL_PYTEST_ITEMS = None diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 1204d0c8e6..7bac4b180f 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -17,6 +17,10 @@ l0_dgx_b200: tests: - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEPLowLatency] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[MNNVL] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_configurable_moe-TRTLLM-dtype1] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[enable_configurable_moe-TRTLLM] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_mxfp4_mxfp8[enable_configurable_moe-True-8-64-TRTLLM] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_wfp4a16[enable_configurable_moe-TRTLLM-2880-dtype0] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[pp4-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] @@ -158,6 +162,8 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[enable_configurable_moe-moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] @@ -191,12 +197,16 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True] - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0] - accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-tp4-trtllm-fp8] + - 
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-ep4-trtllm-fp8] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-dp4-trtllm-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus_online_eplb[enable_configurable_moe-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-overlap_scheduler] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model-no_overlap_scheduler] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model-no_overlap_scheduler] diff --git a/tests/unittest/_torch/modules/conftest.py b/tests/unittest/_torch/modules/conftest.py new file mode 100644 index 0000000000..c7e85eeeea --- /dev/null +++ b/tests/unittest/_torch/modules/conftest.py @@ -0,0 +1,118 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TEMPORARY FILE - Will be removed after MoE refactor is complete. +# +# Background: +# The `enable_configurable_moe` parameter is a temporary measure during the MoE +# refactor. The old and new MoE flows will coexist for a period of time. To avoid +# large-scale changes to the existing test lists, we handle the test ID cleanup +# here. Once the refactor is complete and all tests use ConfigurableMoE by default, +# this file will no longer be needed and should be deleted. +# +# Two-phase approach: +# 1. pytest_sessionstart: Convert clean test names in CLI args back to original +# format so pytest can find tests during collection. +# 2. pytest_collection_modifyitems: Clean up the collected test IDs for display +# and waive matching. +import re + +# Test functions that use enable_configurable_moe parameter and need ID conversion +TESTS_WITH_CONFIGURABLE_MOE = [ + "test_fused_moe_nvfp4", + "test_fused_moe_mxfp4_mxfp8", + "test_fused_moe_w4a8_nvfp4_fp8", + "test_fused_moe_wfp4a16", +] + + +def _convert_clean_to_original_moe_test_id(test_id): + """Convert clean MoE test ID back to original format for pytest collection. + + Example: "test_fused_moe.py::test_foo[TRTLLM-dtype0]" -> "test_fused_moe.py::test_foo[-TRTLLM-dtype0]" + + This is needed because the `enable_configurable_moe` parameter uses empty string + as ID when value is 0, resulting in test IDs like "test_foo[-TRTLLM-dtype0]". + We clean these up in pytest_collection_modifyitems, but pytest filters tests + during collection using the original IDs. 
So when user runs with clean test name, + we need to convert it back to match the original. + """ + if "test_fused_moe.py" not in test_id: + return test_id + + # Match pattern like "test_name[params]" and add leading dash after "[" + # But only if params don't already start with "-" or "enable_configurable_moe" + match = re.search(r"\[([^\]]+)\]", test_id) + if match: + params = match.group(1) + # Skip if already has leading dash or starts with enable_configurable_moe + if not params.startswith("-") and not params.startswith("enable_configurable_moe"): + # Add leading dash to params + new_params = "-" + params + test_id = test_id.replace(f"[{params}]", f"[{new_params}]") + + return test_id + + +def pytest_sessionstart(session): + """Convert clean MoE test IDs in config.args to original format for collection. + + This is needed because pytest filters tests during collection using original IDs. + When user runs with clean test name, we convert it back to match the original. + """ + args = session.config.args + for i, arg in enumerate(args): + if "test_fused_moe.py" in arg and "[" in arg: + # Only apply conversion to specific tests that use enable_configurable_moe + should_convert = any(test_name in arg for test_name in TESTS_WITH_CONFIGURABLE_MOE) + if should_convert: + args[i] = _convert_clean_to_original_moe_test_id(arg) + + +def pytest_collection_modifyitems(items): + """Clean up test IDs by removing leading/trailing dashes from parameter IDs. + + This is needed because `enable_configurable_moe` parameter can be empty, + resulting in ugly test IDs like "test_foo[-True]" or "test_foo[--abc]". + We clean these up to "test_foo[True]" or "test_foo[abc]" so that: + 1. Test names in waive files and test lists remain unchanged + 2. Test reports look cleaner + + This runs BEFORE the global conftest applies waives (due to hookwrapper). 
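+
+    Example: "test_fused_moe.py::test_foo[-TRTLLM-dtype0]" is reported (and matched
+    against waives) as "test_fused_moe.py::test_foo[TRTLLM-dtype0]".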
+ """ + for item in items: + if "test_fused_moe.py" in item.nodeid and "[" in item.nodeid: + # Only apply cleanup to specific tests that use enable_configurable_moe + should_cleanup = any( + test_name in item.nodeid for test_name in TESTS_WITH_CONFIGURABLE_MOE + ) + if should_cleanup: + original_nodeid = item.nodeid + original_name = item.name + nodeid = item.nodeid + name = item.name + + # Clean up leading/trailing dashes in nodeid + nodeid = nodeid.replace("[-", "[") + nodeid = nodeid.replace("-]", "]") + + # Clean up leading/trailing dashes in name + name = name.replace("[-", "[") + name = name.replace("-]", "]") + + if nodeid != original_nodeid: + item._nodeid = nodeid + if name != original_name: + item.name = name diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py index 4ba09fa79c..1db2aab76a 100644 --- a/tests/unittest/_torch/modules/test_fused_moe.py +++ b/tests/unittest/_torch/modules/test_fused_moe.py @@ -1356,7 +1356,20 @@ def test_fused_moe_fp8_blockwise_cute_dsl_multi_gpu(ep_size, routing_method, @pytest.mark.parametrize("moe_backend", [ pytest.param("TRTLLM", marks=skip_blackwell_geforce), "CUTLASS", "CUTEDSL" ]) -def test_fused_moe_nvfp4(dtype, moe_backend): +@pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") +def test_fused_moe_nvfp4(dtype, moe_backend, enable_configurable_moe, mocker): + + if enable_configurable_moe == 1 and moe_backend != "TRTLLM": + pytest.skip("ENABLE_CONFIGURABLE_MOE=1, only TRTLLM backend is enabled") + + mocker.patch.dict( + os.environ, { + "ENABLE_CONFIGURABLE_MOE": + "1" + if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + }) if moe_backend == "TRTLLM" and dtype == torch.float16: pytest.skip("TRTLLM NVFP4 MoE backend does not support float16 yet") @@ -1515,7 +1528,20 @@ def test_fused_moe_nvfp4(dtype, moe_backend): @pytest.mark.parametrize( "moe_backend", [pytest.param("TRTLLM", marks=skip_blackwell_geforce), "CUTLASS"]) -def test_fused_moe_w4a8_nvfp4_fp8(moe_backend): +@pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") +def test_fused_moe_w4a8_nvfp4_fp8(moe_backend, enable_configurable_moe, mocker): + if enable_configurable_moe == 1 and moe_backend != "TRTLLM": + pytest.skip("ENABLE_CONFIGURABLE_MOE=1, only TRTLLM backend is enabled") + + mocker.patch.dict( + os.environ, { + "ENABLE_CONFIGURABLE_MOE": + "1" + if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + }) + dtype = torch.bfloat16 mapping = Mapping() mapping.rank = mpi_rank() @@ -1930,7 +1956,21 @@ def test_fused_moe_w4afp8(dtype, weight_loading_mode): @pytest.mark.parametrize("hidden_unpadded", [64, 192, 256]) @pytest.mark.parametrize("seq_len", [8, 128]) @pytest.mark.parametrize("bias", [True, False]) -def test_fused_moe_mxfp4_mxfp8(moe_backend, hidden_unpadded, seq_len, bias): +@pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") +def test_fused_moe_mxfp4_mxfp8(moe_backend, hidden_unpadded, seq_len, bias, + enable_configurable_moe, mocker): + + if enable_configurable_moe == 1 and moe_backend != "TRTLLM": + pytest.skip("ENABLE_CONFIGURABLE_MOE=1, only TRTLLM backend is enabled") + + mocker.patch.dict( + os.environ, { + "ENABLE_CONFIGURABLE_MOE": + "1" + if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + }) if moe_backend == "CUTLASS" and hidden_unpadded % 128 != 
0: pytest.skip() @@ -2191,7 +2231,21 @@ def test_fused_moe_mxfp4_mxfp8(moe_backend, hidden_unpadded, seq_len, bias): marks=[skip_pre_hopper, skip_blackwell, skip_blackwell_geforce]), ], ) -def test_fused_moe_wfp4a16(dtype, hidden_size, moe_backend): +@pytest.mark.parametrize("enable_configurable_moe", [0, 1], + ids=lambda x: "" + if x == 0 else "enable_configurable_moe") +def test_fused_moe_wfp4a16(dtype, hidden_size, moe_backend, + enable_configurable_moe, mocker): + + if enable_configurable_moe == 1 and moe_backend != "TRTLLM": + pytest.skip("ENABLE_CONFIGURABLE_MOE=1, only TRTLLM backend is enabled") + + mocker.patch.dict( + os.environ, { + "ENABLE_CONFIGURABLE_MOE": + "1" + if enable_configurable_moe == 1 and moe_backend == "TRTLLM" else "0" + }) mapping = Mapping() mapping.rank = mpi_rank() From 9bfb6179ec6dab87cf7f42a1c5a4b39dbf2b8d27 Mon Sep 17 00:00:00 2001 From: fredricz-20070104 <226039983+fredricz-20070104@users.noreply.github.com> Date: Mon, 8 Dec 2025 10:41:40 +0800 Subject: [PATCH 08/10] [https://nvbugs/5422621][test] Add GB 200 WIDEEP test case for RCCA 5422621 (#9506) Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../defs/perf/disagg/execution/executor.py | 4 +- .../perf/disagg/execution/subprocess_utils.py | 8 ++ ...1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml | 3 + ...x1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml | 3 + ...1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml | 3 + ...x1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml | 3 + ...x1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml | 3 + ...tx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml | 3 + ..._gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml | 3 + ...2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml | 3 + ...x1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml | 95 +++++++++++++++ ...tx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml | 95 +++++++++++++++ ...1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml | 3 + ...x1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml | 3 + ...x1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml | 3 + ...tx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml | 3 + ...x1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml | 3 + ...tx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml | 3 + ..._gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml | 3 + ...2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml | 3 + ...x1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml | 3 + ...tx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml | 3 + ...x1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml | 3 + ...tx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml | 3 + ...6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml | 3 + ...x6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml | 3 + ...8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml | 3 + ...x8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml | 3 + ...en1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml | 1 + ...gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml | 3 + ..._gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml | 3 + ...gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml | 3 + ..._gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml | 3 + ...en1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml | 3 + ...gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml | 3 + ...gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml | 3 + ..._gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml | 3 + ...en1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml | 3 + ...gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml | 3 + ...1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml | 110 ++++++++++++++++++ ..._dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml | 3 + ...gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml | 3 + ..._gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml | 3 + ...gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml | 3 + 
..._gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml | 3 + .../defs/perf/disagg/testlist/disagg.txt | 2 + .../defs/perf/disagg/testlist/wideep.txt | 1 + .../defs/perf/disagg/utils/common.py | 4 +- tests/integration/defs/pytest.ini | 2 +- 49 files changed, 435 insertions(+), 4 deletions(-) create mode 100644 tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml create mode 100644 tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml diff --git a/tests/integration/defs/perf/disagg/execution/executor.py b/tests/integration/defs/perf/disagg/execution/executor.py index 170d2f3e35..fc4d6e785e 100644 --- a/tests/integration/defs/perf/disagg/execution/executor.py +++ b/tests/integration/defs/perf/disagg/execution/executor.py @@ -249,8 +249,8 @@ class JobManager: logger.error(f"Job submission exception: {error_msg}") # Clean up temporary file on exception temp_config_path = test_config.temp_config_path - if os.path.exists(temp_config_path): - os.remove(temp_config_path) + # if os.path.exists(temp_config_path): + # os.remove(temp_config_path) return False, error_msg @staticmethod diff --git a/tests/integration/defs/perf/disagg/execution/subprocess_utils.py b/tests/integration/defs/perf/disagg/execution/subprocess_utils.py index a66d190a16..7034254ee0 100644 --- a/tests/integration/defs/perf/disagg/execution/subprocess_utils.py +++ b/tests/integration/defs/perf/disagg/execution/subprocess_utils.py @@ -12,6 +12,8 @@ No complex process tree cleanup is needed because: import subprocess from typing import Optional +from utils.logger import logger + def exec_cmd(*popenargs, timeout: Optional[float] = None, **kwargs) -> int: """Execute command and return exit code. 
@@ -54,4 +56,10 @@ def exec_cmd_with_output(*popenargs, timeout: Optional[float] = None, **kwargs) check=True, **kwargs, ) + + # Log stderr if it exists + if result.stderr: + stderr_output = result.stderr.decode() + logger.error(f"Command stderr: {stderr_output}") + return result.stdout.decode() diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml index 841eb55b6f..f2cb1cb438 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml index 8fe7d96229..aea7d01c16 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml index 44a93659e7..d49ce13c0d 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml 
b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml index 68c6f5f8c2..2f8c655fc6 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml index cc16b00b1b..786b107f81 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml index 2751424f82..f118685588 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml index a44c8e3286..4aea781e7d 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: 
job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml index 37f884cee5..8ad78695cb 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 0000000000..7b61b2ed53 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,95 @@ +# nvbugs: 5561153 +metadata: + model_name: Qwen3-235B-A22B-FP8 + precision: fp8 + model_dir_name: Qwen3-235B-A22B-FP8 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: 21 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: 1 2 4 8 16 36 + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 2048 + max_seq_len: 2051 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + 
cache_transceiver_config: + max_tokens_in_buffer: 2048 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + disable_overlap_scheduler: false + ctx: + max_batch_size: 32 + max_num_tokens: 2048 + max_seq_len: 2051 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 2048 + backend: NIXL diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml new file mode 100644 index 0000000000..283755728b --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -0,0 +1,95 @@ +# nvbugs: 5561153 +metadata: + model_name: Qwen3-235B-A22B-FP8 + precision: fp8 + model_dir_name: Qwen3-235B-A22B-FP8 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: 21 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: 1 2 4 8 16 36 + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 2048 + max_seq_len: 2051 + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 2048 + backend: UCX + stream_interval: 20 + num_postprocess_workers: 4 + allreduce_strategy: MNNVL + disable_overlap_scheduler: false + ctx: + max_batch_size: 32 + max_num_tokens: 2048 + max_seq_len: 2051 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: false + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 2048 + backend: UCX diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml 
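The new FP8 configs above carry worker_env_var and server_env_var as single space-separated KEY=VALUE strings. A hedged sketch of how such a string could be expanded into an environment mapping before launching a worker (parse_env_var_string is a hypothetical helper, not code from this repo):

import os
import shlex

def parse_env_var_string(env_str):
    # Split 'KEY=VAL KEY2=VAL2' into a dict; shlex keeps quoted values intact.
    env = {}
    for item in shlex.split(env_str or ""):
        key, _, value = item.partition("=")
        if key:
            env[key] = value
    return env

worker_env = parse_env_var_string(
    "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1")
launch_env = {**os.environ, **worker_env}  # e.g. passed to subprocess.Popen(env=...)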
b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml index f9dd57dc2c..33ee191ffd 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -37,6 +38,8 @@ environment: build_wheel: false trtllm_wheel_path: '' work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml index 0a704285d4..12ac8edad0 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml index bda1706561..ab5bd6f719 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml index 1526472d23..7d8cb97621 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: 
"--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml index cb0363fdd3..3f9a7d6a2d 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml index 87142ebc06..f2fd2bc21d 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml index 4d8565a190..5d9d739d58 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml 
b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml index 8f54e9d2a5..f97137297b 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml index 400eac6cf8..6b9078ac5a 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml index 0a44b5b2e5..468354c073 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml index ab31d77167..a970ee6de4 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: 
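Each of these configs also gains extra_args: "--gres=gpu:4", which asks Slurm for four GPUs per node. A sketch of how such a free-form option might be spliced into a submission command (assumed for illustration; the real submission logic lives in disaggr_torch.slurm and its surrounding tooling):

import shlex

def build_sbatch_cmd(script, job_time, job_name, extra_args=""):
    cmd = ["sbatch", f"--time={job_time}", f"--job-name={job_name}"]
    # shlex.split keeps quoted arguments such as --gres=gpu:4 intact.
    cmd += shlex.split(extra_args)
    cmd.append(script)
    return cmd

print(build_sbatch_cmd("disaggr_torch.slurm", "02:00:00",
                       "unified-benchmark", "--gres=gpu:4"))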
"--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml index b0581a7e26..22dc90a06b 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml index a67db056a2..a54b0dacd5 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml index 5a6132741f..ab081e78cf 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml 
b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml index 58cb470baf..f4a5d3bc3a 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml index c7ea82b572..9388365383 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml @@ -14,6 +14,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: e2e @@ -36,6 +37,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml index ba44ed4c10..1eaf479dcc 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/accuracy/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml @@ -21,6 +21,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml index 489b4aeacf..60a221d996 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml index 5a25ecfc4a..8724f191f5 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-UCX.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml index 28c55ce399..738c720650 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml index e2a9f70588..af30a466be 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml index 5cf614ba63..c44b3f6bba 
100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml index 872e5c7a1c..b7a79d7434 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-UCX.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml index c6879f3cbb..73a27246c0 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml index 2f254163c5..e95e71ca15 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-UCX.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ 
environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml index 01362f7853..6055421a27 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml index 17ffdbd15b..6b47c0fc36 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-UCX.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml new file mode 100644 index 0000000000..1e71708f57 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT.yaml @@ -0,0 +1,110 @@ +# nvbugs: 5422621 +metadata: + model_name: deepseek-r1-fp4 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-V2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + config_index: 7 + dataset_file: datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '12288' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + 
num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 48 + moe_expert_parallel_size: 48 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 1024 + max_seq_len: 2176 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.7 + dtype: fp8 + moe_config: + backend: WIDEEP + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 8320 + backend: DEFAULT + stream_interval: 20 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 4 + max_num_tokens: 4480 + max_seq_len: 2176 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 8320 + backend: DEFAULT diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml index 52012de6e2..06900691bc 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_ccb-DEFAULT.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: disaggr-test + extra_args: "--gres=gpu:4" numa_bind: true hardware: gpus_per_node: 4 @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml index 216c6f8899..13572a6049 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-NIXL.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ 
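The gen worker above enables cuda_graph_config padding over a fixed ladder of captured batch sizes. A small sketch of the padding idea, assuming a runtime batch is rounded up to the nearest captured size when one exists (illustrative only, not TensorRT-LLM's implementation):

import bisect

CAPTURED_BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 768, 1024, 2048]

def padded_batch_size(n, captured=CAPTURED_BATCH_SIZES):
    # Round up to the nearest captured CUDA graph batch size; fall back to n
    # if the batch is larger than anything captured.
    i = bisect.bisect_left(captured, n)
    return captured[i] if i < len(captured) else n

assert padded_batch_size(100) == 128
assert padded_batch_size(4096) == 4096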
-37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml index 104e567525..30e6152302 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml index 9aa8e38d15..55391a698c 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml index d60df72d59..62301215e9 100644 --- a/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/wideep/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_ccb-UCX.yaml @@ -15,6 +15,7 @@ slurm: account: job_time: 02:00:00 job_name: unified-benchmark + extra_args: "--gres=gpu:4" numa_bind: true benchmark: mode: gen_only @@ -37,6 +38,8 @@ environment: trtllm_repo: '' build_wheel: false work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false accuracy: diff --git a/tests/integration/defs/perf/disagg/testlist/disagg.txt b/tests/integration/defs/perf/disagg/testlist/disagg.txt index 8f06a99961..bd0c10fb4c 100644 --- 
a/tests/integration/defs/perf/disagg/testlist/disagg.txt +++ b/tests/integration/defs/perf/disagg/testlist/disagg.txt @@ -16,6 +16,8 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX] test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL] test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX] +test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL] +test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX] # test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL] # test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL] # test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX] diff --git a/tests/integration/defs/perf/disagg/testlist/wideep.txt b/tests/integration/defs/perf/disagg/testlist/wideep.txt index 4f1064ec68..55e7bd4721 100644 --- a/tests/integration/defs/perf/disagg/testlist/wideep.txt +++ b/tests/integration/defs/perf/disagg/testlist/wideep.txt @@ -7,6 +7,7 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_ test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX] test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL] test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL] +test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT] # test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL] # test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL] # test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL] diff --git a/tests/integration/defs/perf/disagg/utils/common.py b/tests/integration/defs/perf/disagg/utils/common.py index b62ef4341c..9fb72fbacb 100644 --- a/tests/integration/defs/perf/disagg/utils/common.py +++ b/tests/integration/defs/perf/disagg/utils/common.py @@ -1,6 +1,7 @@ """Disaggregated Benchmark Configuration.""" import os +from datetime import datetime SESSION_COLLECT_CMD_TYPE = "session_collect" @@ -169,7 +170,8 @@ def extract_config_fields(config_data: dict) -> dict: # Generate derived fields dep_flag = "dep" if gen_enable_dp else "tep" - log_base = f"{isl}-{osl}" + date_prefix = datetime.now().strftime("%Y%m%d") + log_base = f"{date_prefix}/{isl}-{osl}" context_dir = ( f"ctx{ctx_num}_gen{gen_num}_{dep_flag}{gen_tp_size}_" f"batch{gen_batch_size}_eplb{eplb_slots}_mtp{mtp_size}" diff --git a/tests/integration/defs/pytest.ini b/tests/integration/defs/pytest.ini index 6d6237d581..dcca875f03 100644 --- a/tests/integration/defs/pytest.ini +++ 
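The utils/common.py change above prefixes the per-run log directory with the current date, so repeated runs of the same ISL/OSL pair no longer collide. A sketch of the resulting layout, mirroring the diff (the helper name is hypothetical):

from datetime import datetime
from pathlib import Path

def make_log_base(isl, osl):
    date_prefix = datetime.now().strftime("%Y%m%d")
    return Path(date_prefix) / f"{isl}-{osl}"

print(make_log_base(1024, 1024))  # e.g. 20251208/1024-1024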
b/tests/integration/defs/pytest.ini @@ -6,7 +6,7 @@ junit_family=legacy addopts = --ignore-glob="*perf/test_perf.py" --ignore-glob="*perf/disagg/*" --ignore-glob="*test_list_validation.py" --ignore-glob="*llm-test-workspace*" --durations=0 -W ignore::DeprecationWarning pythonpath = ../../../examples/auto_deploy -norecursedirs = ./triton/perf +norecursedirs = ./triton/perf ./perf/disagg markers = skip_less_device: skip when less device detected than the declared skip_less_mpi_world_size: skip when less mpi world size detected than the declared From 8b9ab9a701b706cf934b0b8cb680d2f7e85f57c2 Mon Sep 17 00:00:00 2001 From: Yukun He <23156053+hyukn@users.noreply.github.com> Date: Mon, 8 Dec 2025 10:47:21 +0800 Subject: [PATCH 09/10] [None][fix] Fix two tuning cache miss issues. (#9743) Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com> --- tensorrt_llm/_torch/autotuner.py | 7 ++++++- tensorrt_llm/_torch/custom_ops/torch_custom_ops.py | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py index feecf3d174..609efd1055 100644 --- a/tensorrt_llm/_torch/autotuner.py +++ b/tensorrt_llm/_torch/autotuner.py @@ -511,6 +511,11 @@ class AutoTunerProfilingCache: cache = {} cache_data = serializable_cache["cache_data"] + def lists_to_tuples(obj): + if isinstance(obj, list): + return tuple(lists_to_tuples(x) for x in obj) + return obj + for key_str, value in cache_data.items(): # Reconstruct the tuple key safely try: @@ -521,7 +526,7 @@ class AutoTunerProfilingCache: continue runner_id = value["runner_id"] - tactic = value["tactic"] + tactic = lists_to_tuples(value["tactic"]) min_time = value["min_time"] cache[key] = (runner_id, tactic, min_time) diff --git a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py index 003f137883..fe09758cfe 100644 --- a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py @@ -486,10 +486,10 @@ class CublasLtFP4GemmRunner(TunableRunner): self.cublaslt_runner = CublasLtFP4GemmRunner.runner_dict[instance_key] def unique_id(self): - return hash(( + return ( self.to_userbuffers, self.output_dtype, - )) + ) def get_valid_tactics(self, inputs: List[torch.Tensor], profile: OptimizationProfile, **kwargs) -> List[int]: From 03f89d7aa40f77c6c4d5b9f1416d1ddca2a72427 Mon Sep 17 00:00:00 2001 From: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> Date: Mon, 8 Dec 2025 03:07:46 +0000 Subject: [PATCH 10/10] [None][infra] Check in most recent lock file from nightly pipeline Signed-off-by: TensorRT LLM <90828364+tensorrt-cicd@users.noreply.github.com> --- security_scanning/examples/models/core/mllama/poetry.lock | 2 +- security_scanning/metadata.json | 4 ++-- security_scanning/poetry.lock | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/security_scanning/examples/models/core/mllama/poetry.lock b/security_scanning/examples/models/core/mllama/poetry.lock index c58e7c12b5..11e0ed3ccb 100644 --- a/security_scanning/examples/models/core/mllama/poetry.lock +++ b/security_scanning/examples/models/core/mllama/poetry.lock @@ -708,7 +708,7 @@ files = [ [[package]] name = "nvidia-modelopt" version = "0.21.1" -description = "Nvidia Model Optimizer: a unified model optimization and deployment toolkit." +description = "Nvidia TensorRT Model Optimizer: a unified model optimization and deployment toolkit." 
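The autotuner fix above restores tuple-typed tactics when reloading the profiling cache (JSON round-trips tuples as lists), and unique_id now returns the underlying tuple instead of its hash. The conversion's behavior in isolation:

def lists_to_tuples(obj):
    # JSON has no tuple type, so nested tuples come back as lists; convert
    # them recursively so cached tactics hash and compare like live ones.
    if isinstance(obj, list):
        return tuple(lists_to_tuples(x) for x in obj)
    return obj

assert lists_to_tuples([1, [2, 3], "a"]) == (1, (2, 3), "a")
assert hash(lists_to_tuples([1, [2, 3]])) == hash((1, (2, 3)))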
optional = false python-versions = "<3.13,>=3.8" files = [ diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json index d160caaae0..0c24542544 100644 --- a/security_scanning/metadata.json +++ b/security_scanning/metadata.json @@ -1,4 +1,4 @@ { - "commit_hash": "e4c707845ff58fcc0b1d87afb4dd0e64885c780a", - "timestamp": "2025-12-07T02:39:14Z" + "commit_hash": "8e27ce7084d9fab1051e88fc945732e59689761b", + "timestamp": "2025-12-08T02:39:23Z" } diff --git a/security_scanning/poetry.lock b/security_scanning/poetry.lock index e5959abf84..18ed93657e 100644 --- a/security_scanning/poetry.lock +++ b/security_scanning/poetry.lock @@ -2793,7 +2793,7 @@ files = [ [[package]] name = "nvidia-modelopt" version = "0.37.0" -description = "Nvidia Model Optimizer: a unified model optimization and deployment toolkit." +description = "Nvidia TensorRT Model Optimizer: a unified model optimization and deployment toolkit." optional = false python-versions = "<3.13,>=3.10" files = [