mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-11 05:23:38 +08:00
Merge branch 'main' into fix_spec_gate
This commit is contained in:
commit
8b5a8c2304
@ -10,10 +10,11 @@ import torch
|
||||
import yaml
|
||||
|
||||
from tensorrt_llm._torch.autotuner import AutoTuner, autotune
|
||||
from tensorrt_llm._torch.distributed import MPIDist, TorchDist
|
||||
from tensorrt_llm._torch.modules.fused_moe.fused_moe_cutlass import CutlassFusedMoE
|
||||
from tensorrt_llm._torch.modules.fused_moe.interface import AlltoallMethodType
|
||||
from tensorrt_llm._torch.modules.multi_stream_utils import with_multi_stream
|
||||
from tensorrt_llm._utils import local_mpi_rank, mpi_rank, mpi_world_size
|
||||
from tensorrt_llm._utils import local_mpi_rank, mpi_disabled, mpi_rank, mpi_world_size
|
||||
from tensorrt_llm.logger import logger
|
||||
from tensorrt_llm.tools.layer_wise_benchmarks import BalanceMethod, get_runner_cls, mark_ranges
|
||||
|
||||
@ -173,6 +174,8 @@ run_pack = runner.create_run_pack(
|
||||
)
|
||||
if args.enable_autotuner:
|
||||
cache_path = os.getenv("TLLM_AUTOTUNER_CACHE_PATH") or None
|
||||
dist = TorchDist(mapping=mapping) if mpi_disabled() else MPIDist(mapping=mapping)
|
||||
AutoTuner.get().setup_distributed_state(mapping, dist)
|
||||
with autotune(cache_path=cache_path):
|
||||
run_pack()
|
||||
else:
|
||||
|
||||
@ -3705,13 +3705,13 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "werkzeug"
|
||||
version = "3.1.4"
|
||||
version = "3.1.5"
|
||||
description = "The comprehensive WSGI web application library."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
files = [
|
||||
{file = "werkzeug-3.1.4-py3-none-any.whl", hash = "sha256:2ad50fb9ed09cc3af22c54698351027ace879a0b60a3b5edf5730b2f7d876905"},
|
||||
{file = "werkzeug-3.1.4.tar.gz", hash = "sha256:cd3cd98b1b92dc3b7b3995038826c68097dcb16f9baa63abe35f20eafeb9fe5e"},
|
||||
{file = "werkzeug-3.1.5-py3-none-any.whl", hash = "sha256:5111e36e91086ece91f93268bb39b4a35c1e6f1feac762c9c822ded0a4e322dc"},
|
||||
{file = "werkzeug-3.1.5.tar.gz", hash = "sha256:6a548b0e88955dd07ccb25539d7d0cc97417ee9e179677d22c7041c8f078ce67"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
||||
@ -166,15 +166,15 @@ test-tox-coverage = ["coverage (>=5.5)"]
|
||||
|
||||
[[package]]
|
||||
name = "bitsandbytes"
|
||||
version = "0.49.0"
|
||||
version = "0.49.1"
|
||||
description = "k-bit optimizers and matrix multiplication routines."
|
||||
optional = false
|
||||
python-versions = ">=3.10"
|
||||
files = [
|
||||
{file = "bitsandbytes-0.49.0-py3-none-macosx_14_0_arm64.whl", hash = "sha256:17d5b57e6d51b78bcfc07da0e93db061181b25bffabfafe101dd9b75c2710872"},
|
||||
{file = "bitsandbytes-0.49.0-py3-none-manylinux_2_24_aarch64.whl", hash = "sha256:7e69951b4d207a676986fce967544d9599f23518d0f09d478295996aeff377c2"},
|
||||
{file = "bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:0c46cdef50b3174463b6bdf13715c9f1f00b360be3626e3c5d2f8d226af2cf3f"},
|
||||
{file = "bitsandbytes-0.49.0-py3-none-win_amd64.whl", hash = "sha256:57a327c6d65f7eda32eb8d416ef8e44d2415c2e7b4fdb735896abd04171ae696"},
|
||||
{file = "bitsandbytes-0.49.1-py3-none-macosx_14_0_arm64.whl", hash = "sha256:9de01d4384b6c71ef9ab052b98457dc0e4fff8fe06ab14833b5b712700deb005"},
|
||||
{file = "bitsandbytes-0.49.1-py3-none-manylinux_2_24_aarch64.whl", hash = "sha256:acd4730a0db3762d286707f4a3bc1d013d21dd5f0e441900da57ec4198578d4e"},
|
||||
{file = "bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:e7940bf32457dc2e553685285b2a86e82f5ec10b2ae39776c408714f9ae6983c"},
|
||||
{file = "bitsandbytes-0.49.1-py3-none-win_amd64.whl", hash = "sha256:6ead0763f4beb936f9a09acb49ec094a259180906fc0605d9ca0617249c3c798"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
||||
@ -2927,30 +2927,30 @@ six = ">=1.14.0"
|
||||
|
||||
[[package]]
|
||||
name = "ruff"
|
||||
version = "0.14.10"
|
||||
version = "0.14.11"
|
||||
description = "An extremely fast Python linter and code formatter, written in Rust."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "ruff-0.14.10-py3-none-linux_armv6l.whl", hash = "sha256:7a3ce585f2ade3e1f29ec1b92df13e3da262178df8c8bdf876f48fa0e8316c49"},
|
||||
{file = "ruff-0.14.10-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:674f9be9372907f7257c51f1d4fc902cb7cf014b9980152b802794317941f08f"},
|
||||
{file = "ruff-0.14.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d85713d522348837ef9df8efca33ccb8bd6fcfc86a2cde3ccb4bc9d28a18003d"},
|
||||
{file = "ruff-0.14.10-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6987ebe0501ae4f4308d7d24e2d0fe3d7a98430f5adfd0f1fead050a740a3a77"},
|
||||
{file = "ruff-0.14.10-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:16a01dfb7b9e4eee556fbfd5392806b1b8550c9b4a9f6acd3dbe6812b193c70a"},
|
||||
{file = "ruff-0.14.10-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7165d31a925b7a294465fa81be8c12a0e9b60fb02bf177e79067c867e71f8b1f"},
|
||||
{file = "ruff-0.14.10-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c561695675b972effb0c0a45db233f2c816ff3da8dcfbe7dfc7eed625f218935"},
|
||||
{file = "ruff-0.14.10-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4bb98fcbbc61725968893682fd4df8966a34611239c9fd07a1f6a07e7103d08e"},
|
||||
{file = "ruff-0.14.10-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f24b47993a9d8cb858429e97bdf8544c78029f09b520af615c1d261bf827001d"},
|
||||
{file = "ruff-0.14.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:59aabd2e2c4fd614d2862e7939c34a532c04f1084476d6833dddef4afab87e9f"},
|
||||
{file = "ruff-0.14.10-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:213db2b2e44be8625002dbea33bb9c60c66ea2c07c084a00d55732689d697a7f"},
|
||||
{file = "ruff-0.14.10-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b914c40ab64865a17a9a5b67911d14df72346a634527240039eb3bd650e5979d"},
|
||||
{file = "ruff-0.14.10-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1484983559f026788e3a5c07c81ef7d1e97c1c78ed03041a18f75df104c45405"},
|
||||
{file = "ruff-0.14.10-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c70427132db492d25f982fffc8d6c7535cc2fd2c83fc8888f05caaa248521e60"},
|
||||
{file = "ruff-0.14.10-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5bcf45b681e9f1ee6445d317ce1fa9d6cba9a6049542d1c3d5b5958986be8830"},
|
||||
{file = "ruff-0.14.10-py3-none-win32.whl", hash = "sha256:104c49fc7ab73f3f3a758039adea978869a918f31b73280db175b43a2d9b51d6"},
|
||||
{file = "ruff-0.14.10-py3-none-win_amd64.whl", hash = "sha256:466297bd73638c6bdf06485683e812db1c00c7ac96d4ddd0294a338c62fdc154"},
|
||||
{file = "ruff-0.14.10-py3-none-win_arm64.whl", hash = "sha256:e51d046cf6dda98a4633b8a8a771451107413b0f07183b2bef03f075599e44e6"},
|
||||
{file = "ruff-0.14.10.tar.gz", hash = "sha256:9a2e830f075d1a42cd28420d7809ace390832a490ed0966fe373ba288e77aaf4"},
|
||||
{file = "ruff-0.14.11-py3-none-linux_armv6l.whl", hash = "sha256:f6ff2d95cbd335841a7217bdfd9c1d2e44eac2c584197ab1385579d55ff8830e"},
|
||||
{file = "ruff-0.14.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6f6eb5c1c8033680f4172ea9c8d3706c156223010b8b97b05e82c59bdc774ee6"},
|
||||
{file = "ruff-0.14.11-py3-none-macosx_11_0_arm64.whl", hash = "sha256:f2fc34cc896f90080fca01259f96c566f74069a04b25b6205d55379d12a6855e"},
|
||||
{file = "ruff-0.14.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53386375001773ae812b43205d6064dae49ff0968774e6befe16a994fc233caa"},
|
||||
{file = "ruff-0.14.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a697737dce1ca97a0a55b5ff0434ee7205943d4874d638fe3ae66166ff46edbe"},
|
||||
{file = "ruff-0.14.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6845ca1da8ab81ab1dce755a32ad13f1db72e7fba27c486d5d90d65e04d17b8f"},
|
||||
{file = "ruff-0.14.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:e36ce2fd31b54065ec6f76cb08d60159e1b32bdf08507862e32f47e6dde8bcbf"},
|
||||
{file = "ruff-0.14.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:590bcc0e2097ecf74e62a5c10a6b71f008ad82eb97b0a0079e85defe19fe74d9"},
|
||||
{file = "ruff-0.14.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:53fe71125fc158210d57fe4da26e622c9c294022988d08d9347ec1cf782adafe"},
|
||||
{file = "ruff-0.14.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a35c9da08562f1598ded8470fcfef2afb5cf881996e6c0a502ceb61f4bc9c8a3"},
|
||||
{file = "ruff-0.14.11-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:0f3727189a52179393ecf92ec7057c2210203e6af2676f08d92140d3e1ee72c1"},
|
||||
{file = "ruff-0.14.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:eb09f849bd37147a789b85995ff734a6c4a095bed5fd1608c4f56afc3634cde2"},
|
||||
{file = "ruff-0.14.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:c61782543c1231bf71041461c1f28c64b961d457d0f238ac388e2ab173d7ecb7"},
|
||||
{file = "ruff-0.14.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:82ff352ea68fb6766140381748e1f67f83c39860b6446966cff48a315c3e2491"},
|
||||
{file = "ruff-0.14.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:728e56879df4ca5b62a9dde2dd0eb0edda2a55160c0ea28c4025f18c03f86984"},
|
||||
{file = "ruff-0.14.11-py3-none-win32.whl", hash = "sha256:337c5dd11f16ee52ae217757d9b82a26400be7efac883e9e852646f1557ed841"},
|
||||
{file = "ruff-0.14.11-py3-none-win_amd64.whl", hash = "sha256:f981cea63d08456b2c070e64b79cb62f951aa1305282974d4d5216e6e0178ae6"},
|
||||
{file = "ruff-0.14.11-py3-none-win_arm64.whl", hash = "sha256:649fb6c9edd7f751db276ef42df1f3df41c38d67d199570ae2a7bd6cbc3590f0"},
|
||||
{file = "ruff-0.14.11.tar.gz", hash = "sha256:f6dc463bfa5c07a59b1ff2c3b9767373e541346ea105503b4c0369c520a66958"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
@ -550,13 +550,13 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "google-api-core"
|
||||
version = "2.28.1"
|
||||
version = "2.29.0"
|
||||
description = "Google API client core library"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "google_api_core-2.28.1-py3-none-any.whl", hash = "sha256:4021b0f8ceb77a6fb4de6fde4502cecab45062e66ff4f2895169e0b35bc9466c"},
|
||||
{file = "google_api_core-2.28.1.tar.gz", hash = "sha256:2b405df02d68e68ce0fbc138559e6036559e685159d148ae5861013dc201baf8"},
|
||||
{file = "google_api_core-2.29.0-py3-none-any.whl", hash = "sha256:d30bc60980daa36e314b5d5a3e5958b0200cb44ca8fa1be2b614e932b75a3ea9"},
|
||||
{file = "google_api_core-2.29.0.tar.gz", hash = "sha256:84181be0f8e6b04006df75ddfe728f24489f0af57c96a529ff7cf45bc28797f7"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
{
|
||||
"commit_hash": "b85c447ceb1ff91c5d4df6b71de2256a5fabfe9d",
|
||||
"timestamp": "2026-01-08T02:42:38Z"
|
||||
"commit_hash": "56e779d09f5ba965e65022da1d66554e1e57d7f2",
|
||||
"timestamp": "2026-01-09T02:46:11Z"
|
||||
}
|
||||
|
||||
10
security_scanning/poetry.lock
generated
10
security_scanning/poetry.lock
generated
@ -435,19 +435,19 @@ urllib3 = ">=1.25.3,<3"
|
||||
|
||||
[[package]]
|
||||
name = "build"
|
||||
version = "1.3.0"
|
||||
version = "1.4.0"
|
||||
description = "A simple, correct Python build frontend"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
files = [
|
||||
{file = "build-1.3.0-py3-none-any.whl", hash = "sha256:7145f0b5061ba90a1500d60bd1b13ca0a8a4cebdd0cc16ed8adf1c0e739f43b4"},
|
||||
{file = "build-1.3.0.tar.gz", hash = "sha256:698edd0ea270bde950f53aed21f3a0135672206f3911e0176261a31e0e07b397"},
|
||||
{file = "build-1.4.0-py3-none-any.whl", hash = "sha256:6a07c1b8eb6f2b311b96fcbdbce5dab5fe637ffda0fd83c9cac622e927501596"},
|
||||
{file = "build-1.4.0.tar.gz", hash = "sha256:f1b91b925aa322be454f8330c6fb48b465da993d1e7e7e6fa35027ec49f3c936"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
colorama = {version = "*", markers = "os_name == \"nt\""}
|
||||
importlib-metadata = {version = ">=4.6", markers = "python_full_version < \"3.10.2\""}
|
||||
packaging = ">=19.1"
|
||||
packaging = ">=24.0"
|
||||
pyproject_hooks = "*"
|
||||
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
|
||||
|
||||
@ -6339,4 +6339,4 @@ type = ["pytest-mypy"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.10,<3.13"
|
||||
content-hash = "dab9694d64d1c91b512eb62bbd31da9d0cdb8c93e99941a7022f2f46aea905e3"
|
||||
content-hash = "2404aeeb4f38fe2d44201d1cfff64be5c94ba66ec6a5ee783643e4d77c7eb905"
|
||||
|
||||
@ -9,7 +9,7 @@ readme = "README.md"
|
||||
[tool.poetry.dependencies]
|
||||
python = ">=3.10,<3.13"
|
||||
accelerate = ">=1.7.0"
|
||||
build = "^1.3.0"
|
||||
build = "^1.4.0"
|
||||
colored = "^2.3.1"
|
||||
cuda-python = ">=13"
|
||||
diffusers = ">=0.27.0"
|
||||
|
||||
@ -646,7 +646,6 @@ class FP8EPShardingInfo(EPShardingInfo, QuantizationShardingMixin):
|
||||
gm,
|
||||
node,
|
||||
self.config,
|
||||
self.mlp_type,
|
||||
scale_names=self.scale_names(),
|
||||
)
|
||||
|
||||
@ -664,7 +663,7 @@ class NVFP4EPShardingInfo(EPShardingInfo, QuantizationShardingMixin):
|
||||
return ["input_scale", "weight_scale", "alpha"]
|
||||
|
||||
def apply(self, gm: GraphModule, node: Node) -> None:
|
||||
_insert_sharded_moe(gm, node, self.config, self.mlp_type, scale_names=self.scale_names())
|
||||
_insert_sharded_moe(gm, node, self.config, scale_names=self.scale_names())
|
||||
|
||||
|
||||
EP_SHARDING_RULES = [
|
||||
|
||||
@ -953,14 +953,6 @@ class SpecDecOneEngineForCausalLM(DecoderModelForCausalLM[TModel, TConfig],
|
||||
hidden_states = hidden_states[:attn_metadata.num_tokens]
|
||||
|
||||
if self.draft_model is not None:
|
||||
# For one-model speculative decoding with PP, only the last PP rank
|
||||
# has valid hidden_states from the target model. The spec_worker (which
|
||||
# runs the draft model loop) should only run on the last PP rank.
|
||||
# Non-last PP ranks return None and let the PP sync handle the results.
|
||||
mapping = self.model.model_config.mapping
|
||||
if mapping.has_pp() and not mapping.is_last_pp_rank():
|
||||
return None
|
||||
|
||||
# get logits
|
||||
logits = self.logits_processor.forward(
|
||||
hidden_states[spec_metadata.gather_ids],
|
||||
|
||||
@ -141,8 +141,9 @@ class SpeculativeDecodingMode(IntEnum):
|
||||
# 1-model has separate logic for handling draft tokens
|
||||
return False
|
||||
|
||||
xqa_supported = get_sm_version() < 120
|
||||
return not issubclass(attention_backend,
|
||||
TrtllmAttention) or get_sm_version() < 90
|
||||
TrtllmAttention) or not xqa_supported
|
||||
|
||||
def attention_need_spec_dec_mode(
|
||||
self,
|
||||
@ -161,14 +162,16 @@ class SpeculativeDecodingMode(IntEnum):
|
||||
"""
|
||||
is_trtllm_attention = issubclass(attention_backend, TrtllmAttention)
|
||||
|
||||
# Always use the multi-token query mode for 1-model.
|
||||
# Always use the multi-token query mode for 1-model if the kernels are available.
|
||||
xqa_supported = get_sm_version() < 120
|
||||
use_case_1 = self.use_one_engine() and xqa_supported
|
||||
# For 2-model, we need to enable it when we process multiple tokens at once. This occurs with
|
||||
# the target model (verification) or on the first draft for CDL based speculation.
|
||||
use_case_1 = self.is_eagle3_one_model()
|
||||
use_case_2 = (not is_draft_model or
|
||||
(spec_resource_manager is not None
|
||||
and spec_resource_manager.is_first_draft
|
||||
and use_chain_drafter)) and is_trtllm_attention
|
||||
use_case_2 = not self.use_one_engine() and (
|
||||
not is_draft_model or
|
||||
(spec_resource_manager is not None
|
||||
and spec_resource_manager.is_first_draft
|
||||
and use_chain_drafter)) and is_trtllm_attention
|
||||
|
||||
return use_case_1 or use_case_2
|
||||
|
||||
|
||||
@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 4
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 1 * (3 + 1) = 4
|
||||
max_num_tokens: 4
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 1 * (3 + 1) = 4
|
||||
max_num_tokens: 4
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 128
|
||||
# mtp_size=2 ⇒ max_num_tokens = 4 * (2 + 1) = 12
|
||||
max_num_tokens: 12
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 1
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 1 * (3 + 1) = 4
|
||||
max_num_tokens: 4
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 2 * (3 + 1) = 8
|
||||
max_num_tokens: 8
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,7 @@ worker_config:
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
max_num_tokens: 8
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,7 @@ worker_config:
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
max_num_tokens: 6
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,7 @@ worker_config:
|
||||
enable_attention_dp: false
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
max_num_tokens: 8
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,7 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 16
|
||||
max_num_tokens: 128
|
||||
max_num_tokens: 32
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 128
|
||||
# mtp_size=2 ⇒ max_num_tokens = 8 * (2 + 1) = 24
|
||||
max_num_tokens: 24
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 2 * (3 + 1) = 8
|
||||
max_num_tokens: 8
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 8 * (3 + 1) = 32
|
||||
max_num_tokens: 32
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 2
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 2 * (3 + 1) = 8
|
||||
max_num_tokens: 8
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 16
|
||||
max_num_tokens: 128
|
||||
# mtp_size=1 ⇒ max_num_tokens = 16 * (1 + 1) = 32
|
||||
max_num_tokens: 32
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 32
|
||||
max_num_tokens: 128
|
||||
# mtp_size=1 ⇒ max_num_tokens = 32 * (1 + 1) = 64
|
||||
max_num_tokens: 64
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 4
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 4 * (3 + 1) = 16
|
||||
max_num_tokens: 16
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -49,7 +49,8 @@ worker_config:
|
||||
enable_attention_dp: true
|
||||
pipeline_parallel_size: 1
|
||||
max_batch_size: 8
|
||||
max_num_tokens: 128
|
||||
# mtp_size=3 ⇒ max_num_tokens = 8 * (3 + 1) = 32
|
||||
max_num_tokens: 32
|
||||
max_seq_len: 139296
|
||||
cuda_graph_config:
|
||||
enable_padding: true
|
||||
|
||||
@ -442,7 +442,7 @@ triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-deco
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B_Instruct_RocketKV::test_auto_dtype SKIP (https://nvbugs/5762822)
|
||||
unittest/_torch/sampler/test_return_logits.py SKIP (https://nvbugs/5764627)
|
||||
examples/serve/test_serve.py::test_config_file_loading[--config] SKIP (https://nvbugs/5754977)
|
||||
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740075)
|
||||
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugspro.nvidia.com/bug/5794313)
|
||||
examples/test_ray.py::test_ray_disaggregated_serving[tp2] SKIP (https://nvbugs/5612502)
|
||||
unittest/executor/test_rpc_proxy.py SKIP (https://nvbugs/5605741)
|
||||
unittest/executor/test_rpc_worker.py SKIP (https://nvbugs/5605741)
|
||||
@ -467,6 +467,7 @@ accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=True-chunked_co
|
||||
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP (https://nvbugs/5596343)
|
||||
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] SKIP (https://nvbugs/5596343)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True] SKIP (https://nvbugs/5775326)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False] SKIP (https://nvbugs/5794796)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/5775544)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] SKIP (https://nvbugs/5774869)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] SKIP (https://nvbugs/5774869)
|
||||
@ -492,7 +493,6 @@ unittest/_torch/modules/test_fused_moe.py::test_fused_moe_multi_gpu[1-CUTLASS] S
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/5707359)
|
||||
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5673559)
|
||||
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5701445)
|
||||
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075)
|
||||
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0] SKIP (https://nvbugs/5748600)
|
||||
unittest/_torch/ray_orchestrator/multi_gpu/test_multi_instance.py::test_multi_instance[tp2_2instances] SKIP (https://nvbugs/5784566)
|
||||
disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5776445)
|
||||
@ -521,3 +521,4 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5769815)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=False] SKIP (https://nvbugs/5787892)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False] SKIP (https://nvbugs/5787892)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5791839)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user