Merge branch 'main' into fix_spec_gate

Signed-off-by: Zheyu Fu <zheyuf@nvidia.com>
Commit 1d61d74c4e by Zheyu Fu, 2026-01-06 11:39:03 -08:00, committed by GitHub
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
12 changed files with 237 additions and 157 deletions

View File

@@ -287,7 +287,7 @@ def submit_job(config, log_dir, dry_run):
f"--container-image {env_config['container_image']}",
f"--container-name {container_name}",
f"--container-mounts {env_config['container_mount']}",
"--mpi=pmix --overlap",
"--no-container-mount-home --mpi=pmix --overlap",
f"bash {os.path.join(env_config['work_dir'], 'start_worker.sh')}",
server_type,
str(server_id),
@@ -313,7 +313,7 @@ def submit_job(config, log_dir, dry_run):
f"--container-name={container_name}",
f"--container-image={env_config['container_image']}",
f"--container-mounts={env_config['container_mount']}",
f"--mpi=pmix --overlap -N 1 -n 1",
f"--no-container-mount-home --mpi=pmix --overlap -N 1 -n 1",
f"bash {env_config['work_dir']}/start_server.sh {os.path.join(log_dir, 'server_config.yaml')} \"{server_env_var}\"",
f"&> {log_dir}/4_output_server.log &",
]
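
For reference, a hypothetical sketch (not the repository's submit_job; the env_config keys and the srun invocation shape are assumed from the fragments above) of how these srun fragments are joined into one shell command, with the newly added --no-container-mount-home flag keeping the host home directory from being mounted into the container:

```python
# Hypothetical sketch: join srun fragments like the ones above into one command.
# --no-container-mount-home (pyxis/enroot) skips mounting the user's home
# directory, so host dotfiles cannot leak into the job environment.
def build_srun_command(env_config: dict, container_name: str, log_dir: str) -> str:
    parts = [
        "srun -l",
        f"--container-image={env_config['container_image']}",
        f"--container-name={container_name}",
        f"--container-mounts={env_config['container_mount']}",
        "--no-container-mount-home --mpi=pmix --overlap -N 1 -n 1",
        f"bash {env_config['work_dir']}/start_server.sh {log_dir}/server_config.yaml",
    ]
    return " ".join(parts)


print(build_srun_command(
    {"container_image": "nvcr.io/example:latest",
     "container_mount": "/data:/data",
     "work_dir": "/workspace"},
    container_name="trtllm_server",
    log_dir="/tmp/logs",
))
```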

View File

@@ -3256,12 +3256,13 @@ def launchTestJobs(pipeline, testFilter)
"DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
"DGX_B200-4_GPUs-PyTorch-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
"DGX_B200-8_GPUs-PyTorch-1": ["b200-x8-lbd", "l0_dgx_b200", 1, 1, 8, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 2, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true],
"B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
"DGX_B300-4_GPUs-PyTorch-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
// PerfSanity post-merge tests

View File

@@ -813,13 +813,14 @@ class DSAtrtllmAttentionMetadata(TrtllmAttentionMetadata):
# Expand schedule metadata buffer (only generation)
kv_lens_expanded = self.kv_lens_expanded_cuda[:num_tokens]
scheduler_metadata_buffer_expanded = get_paged_mqa_logits_metadata(
kv_lens_expanded, tokens_per_block, self.num_sms)
kv_lens_expanded, self.kv_cache_manager.tokens_per_block,
self.num_sms)
self.scheduler_metadata_buffer_expanded.copy_(
scheduler_metadata_buffer_expanded, non_blocking=True)
elif self.max_draft_tokens == 3:
scheduler_metadata_buffer_mtp3 = get_paged_mqa_logits_metadata(
self.kv_lens_cuda[self.num_contexts:self.num_seqs],
tokens_per_block, self.num_sms // 2)
self.kv_cache_manager.tokens_per_block, self.num_sms // 2)
self.scheduler_metadata_buffer_mtp3.copy_(
scheduler_metadata_buffer_mtp3, non_blocking=True)
self.prepare_dense_topk_indices(self.kv_lens_cuda, device=True)
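
The fix above takes tokens_per_block from the KV cache manager instead of a local variable. A toy sketch (hypothetical function, not the DeepGEMM get_paged_mqa_logits_metadata API) of why the page size must match the paged KV layout when sizing per-sequence scheduler metadata:

```python
# Toy illustration: the number of KV pages per sequence depends directly on
# tokens_per_block, so it must come from the same KV cache manager that
# allocated the pages (the function name below is hypothetical).
import math

def pages_per_sequence(kv_lens: list[int], tokens_per_block: int) -> list[int]:
    # Each sequence occupies ceil(kv_len / tokens_per_block) pages.
    return [math.ceil(n / tokens_per_block) for n in kv_lens]

# With 64-token pages, a 100-token sequence spans 2 pages; with 32-token pages, 4.
assert pages_per_sequence([100, 256], tokens_per_block=64) == [2, 4]
assert pages_per_sequence([100, 256], tokens_per_block=32) == [4, 8]
```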

View File

@@ -18,7 +18,7 @@ from torch.cuda import device_count
from tensorrt_llm import LLM as PyTorchLLM
from tensorrt_llm import MultimodalEncoder
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._utils import get_free_port, mpi_rank
from tensorrt_llm._utils import mpi_rank
from tensorrt_llm.executor.utils import LlmLauncherEnvs
from tensorrt_llm.inputs.multimodal import MultimodalServerConfig
from tensorrt_llm.llmapi import (BuildConfig, CapacitySchedulerPolicy,
@@ -189,25 +189,12 @@ def launch_server(
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
# If disagg cluster config is provided and port is not specified, try to find a free port, otherwise try to bind to the specified port
assert port > 0 or disagg_cluster_config is not None, "Port must be specified if disagg cluster config is not provided"
if port > 0:
port_retries = 1
else:
port_retries = 100
port = get_free_port()
while port_retries > 0:
try:
s.bind((host, port))
break
except OSError as e:
port_retries -= 1
if port_retries == 0:
raise RuntimeError(
f"Failed to bind socket to {host}:{port}: {e}")
else:
logger.warning(
f"Failed to bind socket to {host}:{port}: {e}, retrying {port_retries}..."
)
port = get_free_port()
try:
s.bind((host, port))
if port == 0:
port = s.getsockname()[1]
except OSError as e:
raise RuntimeError(f"Failed to bind socket to {host}:{port}: {e}")
if backend == 'pytorch':
llm_args.pop("build_config", None)
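
The simplified binding above uses the standard OS-assigned-port idiom: binding to port 0 lets the kernel choose a free port, which getsockname() then reports. A self-contained sketch of that pattern using only the socket module (the helper name is illustrative):

```python
# Minimal sketch of the bind-to-port-0 idiom: the kernel assigns a free port,
# and getsockname() reveals which one it chose.
import socket

def bind_or_assign(host: str, port: int) -> tuple[socket.socket, int]:
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.bind((host, port))  # port == 0 asks the OS for any free port
    except OSError as e:
        s.close()
        raise RuntimeError(f"Failed to bind socket to {host}:{port}: {e}")
    return s, s.getsockname()[1]  # actual port, whether requested or assigned

if __name__ == "__main__":
    sock, chosen = bind_or_assign("127.0.0.1", 0)
    print(f"OS assigned port {chosen}")
    sock.close()
```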

View File

@@ -8,3 +8,14 @@ deepseek-ai/DeepSeek-V3-Lite:
- accuracy: 77.00
- spec_dec_algo: MTP
accuracy: 77.00
google/gemma-3-1b-it:
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 61.00
GPT-OSS/120B-MXFP4:
- quant_algo: W4A16_MXFP4
spec_dec_algo: Eagle
accuracy: 62.00
- quant_algo: W4A8_MXFP4_MXFP8
spec_dec_algo: Eagle
accuracy: 62.00

View File

@@ -1105,6 +1105,37 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
def test_fp8_vswa_reuse(self):
# NOTE: Test with VSWA kv cache config.
kv_cache_config = KvCacheConfig(
enable_block_reuse=True,
max_attention_window=[512, 512, 512, 512, 512, 32768],
)
prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-1b-it-fp8/"
with LLM(prequantized_model_path,
kv_cache_config=kv_cache_config) as llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
@pytest.mark.parametrize("backend", ["xgrammar"])
def test_fp8_guided_decoding_vswa_reuse(self, backend: str, mocker):
mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-1b-it-fp8/"
kv_cache_config = KvCacheConfig(
enable_block_reuse=True,
max_attention_window=[512, 512, 512, 512, 512, 32768],
)
cuda_graph_config = CudaGraphConfig(enable_padding=True)
llm = LLM(prequantized_model_path,
guided_decoding_backend=backend,
kv_cache_config=kv_cache_config,
cuda_graph_config=cuda_graph_config)
with llm:
task = JsonModeEval(self.MODEL_NAME)
task.evaluate(llm)
def test_auto_dtype_vswa_without_reuse(self):
# NOTE: Test with VSWA kv cache config.
kv_cache_config = KvCacheConfig(
@ -2269,6 +2300,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
torch.cuda.empty_cache()
@skip_pre_blackwell
@pytest.mark.skip_less_device_memory(95000)
@pytest.mark.parametrize(
"tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend",
[
@@ -4460,6 +4492,114 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
sampling_params=sampling_params,
extra_evaluator_kwargs=extra_evaluator_kwargs)
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("one_model", [True, False],
ids=["one_model", "two_model"])
def test_eagle3_vswa_reuse_4gpus(self, one_model, mocker):
MAX_OUTPUT_LEN = 128179
MAX_INPUT_LEN = 32768
mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
{"scores_filter": "exact_match,flexible-extract"})
mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
pytorch_config = dict(cuda_graph_config=CudaGraphConfig())
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
dtype="auto",
enable_block_reuse=True,
max_attention_window=[128, 32768])
eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
draft_len = 3
spec_config = EagleDecodingConfig(max_draft_len=draft_len,
speculative_model_dir=eagle_model_dir,
eagle3_one_model=one_model,
allow_advanced_sampling=True)
max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
llm = LLM(self.MODEL_PATH,
tensor_parallel_size=4,
pipeline_parallel_size=1,
moe_expert_parallel_size=1,
kv_cache_config=kv_cache_config,
max_seq_len=max_seq_len,
speculative_config=spec_config,
**pytorch_config,
enable_attention_dp=False)
with llm:
model_name = "GPT-OSS/120B-MXFP4"
# GSM8K
task = GSM8K(model_name)
task.evaluate(llm,
extra_evaluator_kwargs=self.extra_evaluator_kwargs)
# GPQA Medium Reasoning
task = GPQADiamond(model_name)
chat_template_kwargs = dict(reasoning_effort="medium")
extra_evaluator_kwargs = {
**self.extra_evaluator_kwargs, "chat_template_kwargs":
chat_template_kwargs
}
sampling_params = SamplingParams(
temperature=1.0,
top_p=1.0,
max_tokens=MAX_OUTPUT_LEN,
truncate_prompt_tokens=MAX_INPUT_LEN)
task.evaluate(llm,
sampling_params=sampling_params,
extra_evaluator_kwargs=extra_evaluator_kwargs)
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("one_model", [True, False],
ids=["one_model", "two_model"])
def test_eagle3_guided_decoding_4gpus(self, one_model, mocker):
MAX_OUTPUT_LEN = 128179
MAX_INPUT_LEN = 32768
mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
{"scores_filter": "exact_match,flexible-extract"})
mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
pytorch_config = dict(cuda_graph_config=CudaGraphConfig())
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
dtype="auto")
eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
draft_len = 3
spec_config = EagleDecodingConfig(max_draft_len=draft_len,
speculative_model_dir=eagle_model_dir,
eagle3_one_model=one_model,
allow_advanced_sampling=True)
max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
llm = LLM(self.MODEL_PATH,
tensor_parallel_size=4,
pipeline_parallel_size=1,
moe_expert_parallel_size=1,
guided_decoding_backend="xgrammar",
kv_cache_config=kv_cache_config,
max_seq_len=max_seq_len,
speculative_config=spec_config,
**pytorch_config,
enable_attention_dp=False)
with llm:
model_name = "GPT-OSS/120B-MXFP4"
task = JsonModeEval(model_name)
task.evaluate(llm)
@pytest.mark.skip_less_device(2)
@pytest.mark.timeout(14400)
@pytest.mark.parametrize("overlap_scheduler", [True, False],

View File

@@ -15,6 +15,7 @@
import copy
import os
import platform
import random
import re
import socket
import tempfile
@@ -1162,28 +1163,33 @@ def get_free_port_in_ci(max_attempts=100):
Get a free port in the range [CONTAINER_PORT_START, CONTAINER_PORT_START + CONTAINER_PORT_NUM - 1]
If CONTAINER_PORT_START and CONTAINER_PORT_NUM are not set or all ports are already in use, fallback to get_free_port
"""
global PORTS_IN_USE
container_port_start = int(os.environ.get("CONTAINER_PORT_START", -1))
container_port_num = int(os.environ.get("CONTAINER_PORT_NUM", -1))
if container_port_start != -1 and container_port_num != -1:
for i in range(container_port_num):
port = container_port_start + i
if port in PORTS_IN_USE:
continue
available_ports = [
port for port in range(container_port_start, container_port_start +
container_port_num)
if port not in PORTS_IN_USE
]
for _ in range(len(available_ports)):
# Get a random port from the available ports
port = random.choice(available_ports)
# Check if the port is free
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
try:
s.bind(("localhost", port))
# Port is free, add it to the set of used ports
PORTS_IN_USE.add(port)
return port
except OSError:
# Port is not free, try the next port
available_ports.remove(port)
continue
# No port found in the range, try to get a random free port from the system
for i in range(max_attempts):
for _ in range(max_attempts):
port = get_free_port()
if port not in PORTS_IN_USE:
PORTS_IN_USE.add(port)
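
The reworked helper above picks randomly from the configured container port range rather than scanning it in order, which spreads concurrent CI workers across the range. A self-contained sketch of the same idea; the function name and the `used` set are illustrative stand-ins for the module's PORTS_IN_USE bookkeeping:

```python
# Sketch: pick a random free port from the CONTAINER_PORT_START range,
# falling back to an OS-assigned ephemeral port when the range is unset
# or fully exhausted.
import os
import random
import socket

def pick_ci_port(used: set[int]) -> int:
    start = int(os.environ.get("CONTAINER_PORT_START", -1))
    num = int(os.environ.get("CONTAINER_PORT_NUM", -1))
    if start != -1 and num != -1:
        candidates = [p for p in range(start, start + num) if p not in used]
        random.shuffle(candidates)
        for port in candidates:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                try:
                    s.bind(("localhost", port))
                    used.add(port)
                    return port
                except OSError:
                    continue  # taken by another process; try the next candidate
    # Fallback: let the OS choose any free port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("localhost", 0))
        port = s.getsockname()[1]
    used.add(port)
    return port
```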

View File

@@ -430,6 +430,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_vswa_reuse
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse[xgrammar]
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
@@ -613,6 +615,10 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram

View File

@@ -151,6 +151,9 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_sm120[throughput_tp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler]
@@ -204,6 +207,18 @@ accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model]
test_e2e.py::test_ptp_quickstart_advanced_mixed_precision
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]

View File

@@ -31,11 +31,9 @@ l0_dgx_b300:
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
@@ -44,15 +42,12 @@ l0_dgx_b300:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False]
@@ -61,11 +56,9 @@ l0_dgx_b300:
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8]
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16]
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
@@ -75,3 +68,25 @@ l0_dgx_b300:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] TIMEOUT (180)
# ------------- AutoDeploy tests ---------------
- condition:
ranges:
system_gpu_count:
gte: 4
lte: 4
wildcards:
gpu:
- '*gb110*'
- '*b300*'
linux_distribution_name: ubuntu*
cpu: x86_64
terms:
stage: pre_merge
backend: pytorch
tests:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=False]
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]

View File

@@ -345,7 +345,6 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_a
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/5673527)
unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py::test_build_ad[meta-llama/Meta-Llama-3.1-8B-Instruct-llm_extra_args0-2] SKIP (https://nvbugs/5680755)
full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache SKIP (https://nvbugs/5682551)
unittest/_torch/speculative/test_eagle3.py::test_qwen3_eagle3[True-True-True-True] SKIP (https://nvbugspro.nvidia.com/bug/5749988)
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype SKIP (https://nvbugs/5612438)
accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True] SKIP (https://nvbugs/5688721)
accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2] SKIP (https://nvbugs/5769712)
@@ -419,7 +418,6 @@ accuracy/test_cli_flow.py::TestPhi3_5MiniInstruct::test_auto_dtype SKIP (https:/
unittest/_torch/auto_deploy/unit/singlegpu/models/test_llama4_vlm_patch.py::test_build_run_llama4_vlm SKIP (https://nvbugs/5747878)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=TRTLLM] SKIP (https://nvbugs/5740377)
cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0] SKIP (https://nvbugs/5748600)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979)
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5608979)
examples/test_bert.py::test_llm_bert_general[compare_hf-disable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979)
@@ -471,15 +469,12 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestMistralLarge3_675B::test_nvfp4_
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8] SKIP (https://nvbugs/5772396)
full:sm100/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-auto] SKIP (https://nvbugs/5772396)
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm] SKIP (https://nvbugs/5772360)
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_fp8 SKIP (https://nvbugs/5772361)
accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model] SKIP (https://nvbugs/5772993)
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image] SKIP (https://nvbugs/5772363)
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph SKIP (https://nvbugs/5772995)
test_e2e.py::test_eagle3_output_consistency_4gpus[Qwen3/Qwen3-30B-A3B-Qwen3/Qwen3-30B-eagle3] SKIP (https://nvbugs/5685010)
full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2] SKIP (https://nvbugs/5773047)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-dp4-trtllm-fp8] SKIP (https://nvbugs/5773201)
unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py::test_sharding[GQA_Block-torch_dist_all_reduce-True-False-2] SKIP (https://nvbugs/5766982)
test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image] SKIP (https://nvbugs/5773195)
accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=True-chunked_context=True] SKIP (https://nvbugs/5773185)
accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=True-chunked_context=False] SKIP (https://nvbugs/5773185)
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP (https://nvbugs/5596343)
@@ -504,3 +499,18 @@ unittest/_torch/attention/test_flashinfer_star_attn.py::TestStarAttention::test_
unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py::test_reducescatter_pg_op[var_len:True-seqlen:16-hidden:128] SKIP (https://nvbugs/5781383)
cpp/test_e2e.py::test_model[-mamba-86] SKIP (https://nvbugs/5781665)
unittest/llmapi/test_llm_multi_gpu_pytorch.py::test_tinyllama_logits_processor_tp2pp2 SKIP (https://nvbugs/5781731)
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2] SKIP (https://nvbugs/5756008)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance] SKIP (https://nvbugs/5756008)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5784526)
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_multi_gpu[1-CUTLASS] SKIP (https://nvbugs/5784543)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/5707359)
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5673559)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5701445)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0] SKIP (https://nvbugs/5748600)
unittest/_torch/ray_orchestrator/multi_gpu/test_multi_instance.py::test_multi_instance[tp2_2instances] SKIP (https://nvbugs/5784566)
disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5776445)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model] SKIP (https://nvbugs/5756028)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model] SKIP (https://nvbugs/5756028)
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False] SKIP (https://nvbugs/5785206)
examples/test_gpt.py::test_llm_gpt2_parallel_embedding_2gpu[float16-0] SKIP (https://nvbugs/5784518)

View File

@@ -780,117 +780,5 @@ def test_eagle3_cdl_sampling(disable_overlap_scheduler: bool):
llm_spec.shutdown()
@pytest.mark.parametrize(
"enable_block_reuse,use_one_model,enable_chunked_prefill,fp8_target", [
[True, True, True, True],
])
@pytest.mark.high_cuda_memory
def test_qwen3_eagle3(enable_block_reuse: bool, use_one_model: bool,
enable_chunked_prefill: bool, fp8_target: bool):
# Eagle3 one model works with overlap scheduler and block reuse.
total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
if total_mem_gb < 35:
pytest.skip("Not enough memory to load target + draft model")
use_cuda_graph = True
attn_backend = "TRTLLM"
disable_overlap_scheduler = False
use_chain_drafter = True
multi_batch = False
attention_dp = False
models_path = llm_models_root()
eagle_model_dir = f"{models_path}/Zhi-Create-Qwen3-32B-Eagle3"
target_model_dir = f"{models_path}/Qwen3/Qwen3-32B"
if fp8_target:
target_model_dir = f"{models_path}/Qwen3/Qwen3-32B-FP8/"
# bs > 1 gives non-deterministic when doing IFB. There are slight chances
# that ref and spec does not match 100%
max_batch_size = 4 if multi_batch else 1
max_draft_len = 3
kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
max_tokens=8192)
if fp8_target:
kv_cache_config.dtype = 'fp8'
cuda_graph_config = CudaGraphConfig(
batch_sizes=[i for i in range(1, max_batch_size +
1)]) if use_cuda_graph else None
llm_common_config = dict(
model=target_model_dir,
attn_backend=attn_backend,
disable_overlap_scheduler=disable_overlap_scheduler,
cuda_graph_config=cuda_graph_config,
max_batch_size=max_batch_size,
kv_cache_config=kv_cache_config,
enable_attention_dp=attention_dp,
max_seq_len=8192,
enable_chunked_prefill=enable_chunked_prefill,
)
if enable_chunked_prefill:
# Use a small max_num_tokens so that the chunked prefill path gets exercised.
llm_common_config['max_num_tokens'] = 64
spec_config = EagleDecodingConfig(
max_draft_len=max_draft_len,
speculative_model_dir=eagle_model_dir,
eagle3_one_model=use_one_model,
)
spec_config._allow_chain_drafter = use_chain_drafter
# Create the LLM instance
llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
# Acceptance rate tests
if enable_chunked_prefill:
# Use a long prompt for chunked prefill tests.
prompts = [
"The capital of France is a city of romance, art, fashion, and cuisine. Paris is a must-visit destination for anyone who loves history, architecture, and culture. From the iconic Eiffel Tower to the world-famous Louvre Museum, Paris has something to offer for every interest and age.\nThe city is divided into 20 arrondissements, each with its own unique character and charm. The Latin Quarter is a popular area for students and young travelers, while the Champs-Élysées is a hub for shopping and dining. The Montmartre neighborhood is famous for its bohemian vibe and stunning views of the city.\nParis is also known for its beautiful parks and gardens, such as the Luxembourg Gardens and the Tuileries Garden. The city has a rich history, with landmarks like the Notre-Dame Cathedral and the Arc de Triomphe. Visitors can also explore the city's many museums, including the Musée d'Orsay and the Musée Rodin.\nIn addition to its cultural and historical attractions, Paris is also a great destination for foodies. The city is famous for its cuisine, including croissants, baguettes, and cheese. Visitors can sample the city's famous dishes at one of the many restaurants, cafes, and "
]
tok_ids = [llm_spec.tokenizer.encode(prompts[0])]
else:
prompts = [
"The capital of France is",
"The president of the United States is",
]
tok_ids = [llm_spec.tokenizer.encode("The future of AI is")]
if multi_batch:
tok_ids.append(llm_spec.tokenizer.encode(prompts))
sampling_params = SamplingParams(max_tokens=128, temperature=0)
for i in range(len(tok_ids)):
num_tokens = 0
num_drafted = 0
num_accepted = 0
for output in llm_spec.generate_async(tok_ids[i],
sampling_params,
streaming=True):
new_tokens = output.outputs[0].token_ids
num_drafted += max_draft_len
num_accepted += len(new_tokens) - num_tokens - 1
num_tokens = len(new_tokens)
accept_rate = num_accepted / num_drafted
assert accept_rate > 0.10
# Output tests
sampling_params = SamplingParams(max_tokens=10, temperature=0)
results_spec = llm_spec.generate(prompts, sampling_params)
generated_text_spec = [result.outputs[0].text for result in results_spec]
llm_spec.shutdown()
llm_ref = LLM(**llm_common_config)
results_ref = llm_ref.generate(prompts, sampling_params)
generated_text_ref = [result.outputs[0].text for result in results_ref]
llm_ref.shutdown()
for text_spec, text_ref in zip(generated_text_spec, generated_text_ref):
# The spec decode algorithm currently guarantees identical results
assert text_spec == text_ref
if __name__ == "__main__":
unittest.main()