Merge branch 'main' into fix_spec_gate

2026-02-11 05:23:38 +08:00 · 2026-01-08 22:24:39 -08:00 · 2026-01-08 22:24:39 -08:00 · 8b5a8c2304
commit 8b5a8c2304
parent 65d9fb4668 c5331e6dbb
29 changed files with 87 additions and 76 deletions
--- a/examples/layer_wise_benchmarks/run.py
+++ b/examples/layer_wise_benchmarks/run.py
@ -10,10 +10,11 @@ import torch
 import yaml

 from tensorrt_llm._torch.autotuner import AutoTuner, autotune
+from tensorrt_llm._torch.distributed import MPIDist, TorchDist
 from tensorrt_llm._torch.modules.fused_moe.fused_moe_cutlass import CutlassFusedMoE
 from tensorrt_llm._torch.modules.fused_moe.interface import AlltoallMethodType
 from tensorrt_llm._torch.modules.multi_stream_utils import with_multi_stream
-from tensorrt_llm._utils import local_mpi_rank, mpi_rank, mpi_world_size
+from tensorrt_llm._utils import local_mpi_rank, mpi_disabled, mpi_rank, mpi_world_size
 from tensorrt_llm.logger import logger
 from tensorrt_llm.tools.layer_wise_benchmarks import BalanceMethod, get_runner_cls, mark_ranges

@ -173,6 +174,8 @@ run_pack = runner.create_run_pack(
 )
 if args.enable_autotuner:
    cache_path = os.getenv("TLLM_AUTOTUNER_CACHE_PATH") or None
+    dist = TorchDist(mapping=mapping) if mpi_disabled() else MPIDist(mapping=mapping)
+    AutoTuner.get().setup_distributed_state(mapping, dist)
    with autotune(cache_path=cache_path):
        run_pack()
 else:
--- a/security_scanning/examples/auto_deploy/poetry.lock
+++ b/security_scanning/examples/auto_deploy/poetry.lock
@ -3705,13 +3705,13 @@ files = [

 [[package]]
 name = "werkzeug"
-version = "3.1.4"
+version = "3.1.5"
 description = "The comprehensive WSGI web application library."
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "werkzeug-3.1.4-py3-none-any.whl", hash = "sha256:2ad50fb9ed09cc3af22c54698351027ace879a0b60a3b5edf5730b2f7d876905"},
-    {file = "werkzeug-3.1.4.tar.gz", hash = "sha256:cd3cd98b1b92dc3b7b3995038826c68097dcb16f9baa63abe35f20eafeb9fe5e"},
+    {file = "werkzeug-3.1.5-py3-none-any.whl", hash = "sha256:5111e36e91086ece91f93268bb39b4a35c1e6f1feac762c9c822ded0a4e322dc"},
+    {file = "werkzeug-3.1.5.tar.gz", hash = "sha256:6a548b0e88955dd07ccb25539d7d0cc97417ee9e179677d22c7041c8f078ce67"},
 ]

 [package.dependencies]
--- a/security_scanning/examples/models/contrib/stdit/poetry.lock
+++ b/security_scanning/examples/models/contrib/stdit/poetry.lock
@ -166,15 +166,15 @@ test-tox-coverage = ["coverage (>=5.5)"]

 [[package]]
 name = "bitsandbytes"
-version = "0.49.0"
+version = "0.49.1"
 description = "k-bit optimizers and matrix multiplication routines."
 optional = false
 python-versions = ">=3.10"
 files = [
-    {file = "bitsandbytes-0.49.0-py3-none-macosx_14_0_arm64.whl", hash = "sha256:17d5b57e6d51b78bcfc07da0e93db061181b25bffabfafe101dd9b75c2710872"},
-    {file = "bitsandbytes-0.49.0-py3-none-manylinux_2_24_aarch64.whl", hash = "sha256:7e69951b4d207a676986fce967544d9599f23518d0f09d478295996aeff377c2"},
-    {file = "bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:0c46cdef50b3174463b6bdf13715c9f1f00b360be3626e3c5d2f8d226af2cf3f"},
-    {file = "bitsandbytes-0.49.0-py3-none-win_amd64.whl", hash = "sha256:57a327c6d65f7eda32eb8d416ef8e44d2415c2e7b4fdb735896abd04171ae696"},
+    {file = "bitsandbytes-0.49.1-py3-none-macosx_14_0_arm64.whl", hash = "sha256:9de01d4384b6c71ef9ab052b98457dc0e4fff8fe06ab14833b5b712700deb005"},
+    {file = "bitsandbytes-0.49.1-py3-none-manylinux_2_24_aarch64.whl", hash = "sha256:acd4730a0db3762d286707f4a3bc1d013d21dd5f0e441900da57ec4198578d4e"},
+    {file = "bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:e7940bf32457dc2e553685285b2a86e82f5ec10b2ae39776c408714f9ae6983c"},
+    {file = "bitsandbytes-0.49.1-py3-none-win_amd64.whl", hash = "sha256:6ead0763f4beb936f9a09acb49ec094a259180906fc0605d9ca0617249c3c798"},
 ]

 [package.dependencies]
--- a/security_scanning/examples/models/core/qwen/poetry.lock
+++ b/security_scanning/examples/models/core/qwen/poetry.lock
@ -2927,30 +2927,30 @@ six = ">=1.14.0"

 [[package]]
 name = "ruff"
-version = "0.14.10"
+version = "0.14.11"
 description = "An extremely fast Python linter and code formatter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.14.10-py3-none-linux_armv6l.whl", hash = "sha256:7a3ce585f2ade3e1f29ec1b92df13e3da262178df8c8bdf876f48fa0e8316c49"},
-    {file = "ruff-0.14.10-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:674f9be9372907f7257c51f1d4fc902cb7cf014b9980152b802794317941f08f"},
-    {file = "ruff-0.14.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d85713d522348837ef9df8efca33ccb8bd6fcfc86a2cde3ccb4bc9d28a18003d"},
-    {file = "ruff-0.14.10-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6987ebe0501ae4f4308d7d24e2d0fe3d7a98430f5adfd0f1fead050a740a3a77"},
-    {file = "ruff-0.14.10-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:16a01dfb7b9e4eee556fbfd5392806b1b8550c9b4a9f6acd3dbe6812b193c70a"},
-    {file = "ruff-0.14.10-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7165d31a925b7a294465fa81be8c12a0e9b60fb02bf177e79067c867e71f8b1f"},
-    {file = "ruff-0.14.10-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c561695675b972effb0c0a45db233f2c816ff3da8dcfbe7dfc7eed625f218935"},
-    {file = "ruff-0.14.10-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4bb98fcbbc61725968893682fd4df8966a34611239c9fd07a1f6a07e7103d08e"},
-    {file = "ruff-0.14.10-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f24b47993a9d8cb858429e97bdf8544c78029f09b520af615c1d261bf827001d"},
-    {file = "ruff-0.14.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:59aabd2e2c4fd614d2862e7939c34a532c04f1084476d6833dddef4afab87e9f"},
-    {file = "ruff-0.14.10-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:213db2b2e44be8625002dbea33bb9c60c66ea2c07c084a00d55732689d697a7f"},
-    {file = "ruff-0.14.10-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b914c40ab64865a17a9a5b67911d14df72346a634527240039eb3bd650e5979d"},
-    {file = "ruff-0.14.10-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1484983559f026788e3a5c07c81ef7d1e97c1c78ed03041a18f75df104c45405"},
-    {file = "ruff-0.14.10-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c70427132db492d25f982fffc8d6c7535cc2fd2c83fc8888f05caaa248521e60"},
-    {file = "ruff-0.14.10-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5bcf45b681e9f1ee6445d317ce1fa9d6cba9a6049542d1c3d5b5958986be8830"},
-    {file = "ruff-0.14.10-py3-none-win32.whl", hash = "sha256:104c49fc7ab73f3f3a758039adea978869a918f31b73280db175b43a2d9b51d6"},
-    {file = "ruff-0.14.10-py3-none-win_amd64.whl", hash = "sha256:466297bd73638c6bdf06485683e812db1c00c7ac96d4ddd0294a338c62fdc154"},
-    {file = "ruff-0.14.10-py3-none-win_arm64.whl", hash = "sha256:e51d046cf6dda98a4633b8a8a771451107413b0f07183b2bef03f075599e44e6"},
-    {file = "ruff-0.14.10.tar.gz", hash = "sha256:9a2e830f075d1a42cd28420d7809ace390832a490ed0966fe373ba288e77aaf4"},
+    {file = "ruff-0.14.11-py3-none-linux_armv6l.whl", hash = "sha256:f6ff2d95cbd335841a7217bdfd9c1d2e44eac2c584197ab1385579d55ff8830e"},
+    {file = "ruff-0.14.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6f6eb5c1c8033680f4172ea9c8d3706c156223010b8b97b05e82c59bdc774ee6"},
+    {file = "ruff-0.14.11-py3-none-macosx_11_0_arm64.whl", hash = "sha256:f2fc34cc896f90080fca01259f96c566f74069a04b25b6205d55379d12a6855e"},
+    {file = "ruff-0.14.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53386375001773ae812b43205d6064dae49ff0968774e6befe16a994fc233caa"},
+    {file = "ruff-0.14.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a697737dce1ca97a0a55b5ff0434ee7205943d4874d638fe3ae66166ff46edbe"},
+    {file = "ruff-0.14.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6845ca1da8ab81ab1dce755a32ad13f1db72e7fba27c486d5d90d65e04d17b8f"},
+    {file = "ruff-0.14.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:e36ce2fd31b54065ec6f76cb08d60159e1b32bdf08507862e32f47e6dde8bcbf"},
+    {file = "ruff-0.14.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:590bcc0e2097ecf74e62a5c10a6b71f008ad82eb97b0a0079e85defe19fe74d9"},
+    {file = "ruff-0.14.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:53fe71125fc158210d57fe4da26e622c9c294022988d08d9347ec1cf782adafe"},
+    {file = "ruff-0.14.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a35c9da08562f1598ded8470fcfef2afb5cf881996e6c0a502ceb61f4bc9c8a3"},
+    {file = "ruff-0.14.11-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:0f3727189a52179393ecf92ec7057c2210203e6af2676f08d92140d3e1ee72c1"},
+    {file = "ruff-0.14.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:eb09f849bd37147a789b85995ff734a6c4a095bed5fd1608c4f56afc3634cde2"},
+    {file = "ruff-0.14.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:c61782543c1231bf71041461c1f28c64b961d457d0f238ac388e2ab173d7ecb7"},
+    {file = "ruff-0.14.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:82ff352ea68fb6766140381748e1f67f83c39860b6446966cff48a315c3e2491"},
+    {file = "ruff-0.14.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:728e56879df4ca5b62a9dde2dd0eb0edda2a55160c0ea28c4025f18c03f86984"},
+    {file = "ruff-0.14.11-py3-none-win32.whl", hash = "sha256:337c5dd11f16ee52ae217757d9b82a26400be7efac883e9e852646f1557ed841"},
+    {file = "ruff-0.14.11-py3-none-win_amd64.whl", hash = "sha256:f981cea63d08456b2c070e64b79cb62f951aa1305282974d4d5216e6e0178ae6"},
+    {file = "ruff-0.14.11-py3-none-win_arm64.whl", hash = "sha256:649fb6c9edd7f751db276ef42df1f3df41c38d67d199570ae2a7bd6cbc3590f0"},
+    {file = "ruff-0.14.11.tar.gz", hash = "sha256:f6dc463bfa5c07a59b1ff2c3b9767373e541346ea105503b4c0369c520a66958"},
 ]

 [[package]]
--- a/security_scanning/examples/ray_orchestrator/poetry.lock
+++ b/security_scanning/examples/ray_orchestrator/poetry.lock
@ -550,13 +550,13 @@ files = [

 [[package]]
 name = "google-api-core"
-version = "2.28.1"
+version = "2.29.0"
 description = "Google API client core library"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "google_api_core-2.28.1-py3-none-any.whl", hash = "sha256:4021b0f8ceb77a6fb4de6fde4502cecab45062e66ff4f2895169e0b35bc9466c"},
-    {file = "google_api_core-2.28.1.tar.gz", hash = "sha256:2b405df02d68e68ce0fbc138559e6036559e685159d148ae5861013dc201baf8"},
+    {file = "google_api_core-2.29.0-py3-none-any.whl", hash = "sha256:d30bc60980daa36e314b5d5a3e5958b0200cb44ca8fa1be2b614e932b75a3ea9"},
+    {file = "google_api_core-2.29.0.tar.gz", hash = "sha256:84181be0f8e6b04006df75ddfe728f24489f0af57c96a529ff7cf45bc28797f7"},
 ]

 [package.dependencies]
--- a/security_scanning/metadata.json
+++ b/security_scanning/metadata.json
@ -1,4 +1,4 @@
 {
-  "commit_hash": "b85c447ceb1ff91c5d4df6b71de2256a5fabfe9d",
-  "timestamp": "2026-01-08T02:42:38Z"
+  "commit_hash": "56e779d09f5ba965e65022da1d66554e1e57d7f2",
+  "timestamp": "2026-01-09T02:46:11Z"
 }
--- a/security_scanning/poetry.lock
+++ b/security_scanning/poetry.lock
@ -435,19 +435,19 @@ urllib3 = ">=1.25.3,<3"

 [[package]]
 name = "build"
-version = "1.3.0"
+version = "1.4.0"
 description = "A simple, correct Python build frontend"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "build-1.3.0-py3-none-any.whl", hash = "sha256:7145f0b5061ba90a1500d60bd1b13ca0a8a4cebdd0cc16ed8adf1c0e739f43b4"},
-    {file = "build-1.3.0.tar.gz", hash = "sha256:698edd0ea270bde950f53aed21f3a0135672206f3911e0176261a31e0e07b397"},
+    {file = "build-1.4.0-py3-none-any.whl", hash = "sha256:6a07c1b8eb6f2b311b96fcbdbce5dab5fe637ffda0fd83c9cac622e927501596"},
+    {file = "build-1.4.0.tar.gz", hash = "sha256:f1b91b925aa322be454f8330c6fb48b465da993d1e7e7e6fa35027ec49f3c936"},
 ]

 [package.dependencies]
 colorama = {version = "*", markers = "os_name == \"nt\""}
 importlib-metadata = {version = ">=4.6", markers = "python_full_version < \"3.10.2\""}
-packaging = ">=19.1"
+packaging = ">=24.0"
 pyproject_hooks = "*"
 tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}

@ -6339,4 +6339,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.13"
-content-hash = "dab9694d64d1c91b512eb62bbd31da9d0cdb8c93e99941a7022f2f46aea905e3"
+content-hash = "2404aeeb4f38fe2d44201d1cfff64be5c94ba66ec6a5ee783643e4d77c7eb905"
--- a/security_scanning/pyproject.toml
+++ b/security_scanning/pyproject.toml
@ -9,7 +9,7 @@ readme = "README.md"
 [tool.poetry.dependencies]
 python = ">=3.10,<3.13"
 accelerate = ">=1.7.0"
-build = "^1.3.0"
+build = "^1.4.0"
 colored = "^2.3.1"
 cuda-python = ">=13"
 diffusers = ">=0.27.0"
--- a/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py
+++ b/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py
@ -646,7 +646,6 @@ class FP8EPShardingInfo(EPShardingInfo, QuantizationShardingMixin):
            gm,
            node,
            self.config,
-            self.mlp_type,
            scale_names=self.scale_names(),
        )

@ -664,7 +663,7 @@ class NVFP4EPShardingInfo(EPShardingInfo, QuantizationShardingMixin):
        return ["input_scale", "weight_scale", "alpha"]

    def apply(self, gm: GraphModule, node: Node) -> None:
-        _insert_sharded_moe(gm, node, self.config, self.mlp_type, scale_names=self.scale_names())
+        _insert_sharded_moe(gm, node, self.config, scale_names=self.scale_names())


 EP_SHARDING_RULES = [
--- a/tensorrt_llm/_torch/models/modeling_speculative.py
+++ b/tensorrt_llm/_torch/models/modeling_speculative.py
@ -953,14 +953,6 @@ class SpecDecOneEngineForCausalLM(DecoderModelForCausalLM[TModel, TConfig],
            hidden_states = hidden_states[:attn_metadata.num_tokens]

        if self.draft_model is not None:
-            # For one-model speculative decoding with PP, only the last PP rank
-            # has valid hidden_states from the target model. The spec_worker (which
-            # runs the draft model loop) should only run on the last PP rank.
-            # Non-last PP ranks return None and let the PP sync handle the results.
-            mapping = self.model.model_config.mapping
-            if mapping.has_pp() and not mapping.is_last_pp_rank():
-                return None
-
            # get logits
            logits = self.logits_processor.forward(
                hidden_states[spec_metadata.gather_ids],
--- a/tensorrt_llm/_torch/speculative/interface.py
+++ b/tensorrt_llm/_torch/speculative/interface.py
@ -141,8 +141,9 @@ class SpeculativeDecodingMode(IntEnum):
            # 1-model has separate logic for handling draft tokens
            return False

+        xqa_supported = get_sm_version() < 120
        return not issubclass(attention_backend,
-                              TrtllmAttention) or get_sm_version() < 90
+                              TrtllmAttention) or not xqa_supported

    def attention_need_spec_dec_mode(
            self,
@ -161,14 +162,16 @@ class SpeculativeDecodingMode(IntEnum):
        """
        is_trtllm_attention = issubclass(attention_backend, TrtllmAttention)

-        # Always use the multi-token query mode for 1-model.
+        # Always use the multi-token query mode for 1-model if the kernels are available.
+        xqa_supported = get_sm_version() < 120
+        use_case_1 = self.use_one_engine() and xqa_supported
        # For 2-model, we need to enable it when we process multiple tokens at once. This occurs with
        # the target model (verification) or on the first draft for CDL based speculation.
-        use_case_1 = self.is_eagle3_one_model()
-        use_case_2 = (not is_draft_model or
-                      (spec_resource_manager is not None
-                       and spec_resource_manager.is_first_draft
-                       and use_chain_drafter)) and is_trtllm_attention
+        use_case_2 = not self.use_one_engine() and (
+            not is_draft_model or
+            (spec_resource_manager is not None
+             and spec_resource_manager.is_first_draft
+             and use_chain_drafter)) and is_trtllm_attention

        return use_case_1 or use_case_2

--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3-Default.yaml
@ -49,7 +49,8 @@ worker_config:
    enable_attention_dp: false
    pipeline_parallel_size: 4
    max_batch_size: 1
-    max_num_tokens: 128
+    # mtp_size=3 ⇒ max_num_tokens = 1 * (3 + 1) = 4
+    max_num_tokens: 4
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3-Default.yaml
@ -49,7 +49,8 @@ worker_config:
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 1
-    max_num_tokens: 128
+    # mtp_size=3 ⇒ max_num_tokens = 1 * (3 + 1) = 4
+    max_num_tokens: 4
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2-Default.yaml
@ -49,7 +49,8 @@ worker_config:
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 4
-    max_num_tokens: 128
+    # mtp_size=2 ⇒ max_num_tokens = 4 * (2 + 1) = 12
+    max_num_tokens: 12
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3-Default.yaml
@ -49,7 +49,8 @@ worker_config:
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 1
-    max_num_tokens: 128
+    # mtp_size=3 ⇒ max_num_tokens = 1 * (3 + 1) = 4
+    max_num_tokens: 4
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3-Default.yaml
@ -49,7 +49,8 @@ worker_config:
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 2
-    max_num_tokens: 128
+    # mtp_size=3 ⇒ max_num_tokens = 2 * (3 + 1) = 8
+    max_num_tokens: 8
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3-Default.yaml
@ -49,7 +49,7 @@ worker_config:
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 2
-    max_num_tokens: 128
+    max_num_tokens: 8
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2-Default.yaml
@ -49,7 +49,7 @@ worker_config:
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 2
-    max_num_tokens: 128
+    max_num_tokens: 6
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3-Default.yaml
@ -49,7 +49,7 @@ worker_config:
    enable_attention_dp: false
    pipeline_parallel_size: 1
    max_batch_size: 2
-    max_num_tokens: 128
+    max_num_tokens: 8
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1-Default.yaml
@ -49,7 +49,7 @@ worker_config:
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 16
-    max_num_tokens: 128
+    max_num_tokens: 32
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2-Default.yaml
@ -49,7 +49,8 @@ worker_config:
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 8
-    max_num_tokens: 128
+    # mtp_size=2 ⇒ max_num_tokens = 8 * (2 + 1) = 24
+    max_num_tokens: 24
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3-Default.yaml
@ -49,7 +49,8 @@ worker_config:
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 2
-    max_num_tokens: 128
+    # mtp_size=3 ⇒ max_num_tokens = 2 * (3 + 1) = 8
+    max_num_tokens: 8
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3-Default.yaml
@ -49,7 +49,8 @@ worker_config:
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 8
-    max_num_tokens: 128
+    # mtp_size=3 ⇒ max_num_tokens = 8 * (3 + 1) = 32
+    max_num_tokens: 32
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3-Default.yaml
@ -49,7 +49,8 @@ worker_config:
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 2
-    max_num_tokens: 128
+    # mtp_size=3 ⇒ max_num_tokens = 2 * (3 + 1) = 8
+    max_num_tokens: 8
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1-Default.yaml
@ -49,7 +49,8 @@ worker_config:
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 16
-    max_num_tokens: 128
+    # mtp_size=1 ⇒ max_num_tokens = 16 * (1 + 1) = 32
+    max_num_tokens: 32
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1-Default.yaml
@ -49,7 +49,8 @@ worker_config:
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 32
-    max_num_tokens: 128
+    # mtp_size=1 ⇒ max_num_tokens = 32 * (1 + 1) = 64
+    max_num_tokens: 64
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3-Default.yaml
@ -49,7 +49,8 @@ worker_config:
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 4
-    max_num_tokens: 128
+    # mtp_size=3 ⇒ max_num_tokens = 4 * (3 + 1) = 16
+    max_num_tokens: 16
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default.yaml
+++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3-Default.yaml
@ -49,7 +49,8 @@ worker_config:
    enable_attention_dp: true
    pipeline_parallel_size: 1
    max_batch_size: 8
-    max_num_tokens: 128
+    # mtp_size=3 ⇒ max_num_tokens = 8 * (3 + 1) = 32
+    max_num_tokens: 32
    max_seq_len: 139296
    cuda_graph_config:
      enable_padding: true
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@ -442,7 +442,7 @@ triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-deco
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B_Instruct_RocketKV::test_auto_dtype SKIP (https://nvbugs/5762822)
 unittest/_torch/sampler/test_return_logits.py SKIP (https://nvbugs/5764627)
 examples/serve/test_serve.py::test_config_file_loading[--config] SKIP (https://nvbugs/5754977)
-full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740075)
+full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugspro.nvidia.com/bug/5794313)
 examples/test_ray.py::test_ray_disaggregated_serving[tp2] SKIP (https://nvbugs/5612502)
 unittest/executor/test_rpc_proxy.py SKIP (https://nvbugs/5605741)
 unittest/executor/test_rpc_worker.py SKIP (https://nvbugs/5605741)
@ -467,6 +467,7 @@ accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=True-chunked_co
 accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP (https://nvbugs/5596343)
 accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] SKIP (https://nvbugs/5596343)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True] SKIP (https://nvbugs/5775326)
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False] SKIP (https://nvbugs/5794796)
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/5775544)
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] SKIP (https://nvbugs/5774869)
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] SKIP (https://nvbugs/5774869)
@ -492,7 +493,6 @@ unittest/_torch/modules/test_fused_moe.py::test_fused_moe_multi_gpu[1-CUTLASS] S
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/5707359)
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5673559)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5701445)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075)
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0] SKIP (https://nvbugs/5748600)
 unittest/_torch/ray_orchestrator/multi_gpu/test_multi_instance.py::test_multi_instance[tp2_2instances] SKIP (https://nvbugs/5784566)
 disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5776445)
@ -521,3 +521,4 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5769815)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=False] SKIP (https://nvbugs/5787892)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False] SKIP (https://nvbugs/5787892)
+accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5791839)