From 1b1058279c56e3fa188a05b8aad5b542b7ed7533 Mon Sep 17 00:00:00 2001 From: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> Date: Tue, 6 Jan 2026 12:02:27 +0800 Subject: [PATCH 01/15] [TRTLLM-8638][fix] Add failed cases into waives.txt (#10384) Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 88dd569abe..9f66ce0b16 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -505,3 +505,5 @@ unittest/_torch/attention/test_flashinfer_star_attn.py::TestStarAttention::test_ unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py::test_reducescatter_pg_op[var_len:True-seqlen:16-hidden:128] SKIP (https://nvbugs/5781383) cpp/test_e2e.py::test_model[-mamba-86] SKIP (https://nvbugs/5781665) unittest/llmapi/test_llm_multi_gpu_pytorch.py::test_tinyllama_logits_processor_tp2pp2 SKIP (https://nvbugs/5781731) +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2] SKIP (https://nvbugs/5756008) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance] SKIP (https://nvbugs/5756008) From 22a1d31a273d2ff905fc55838527c6b680962d1e Mon Sep 17 00:00:00 2001 From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Date: Tue, 6 Jan 2026 12:28:59 +0800 Subject: [PATCH 02/15] [None][test] update test case constraint (#10381) Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> --- tests/integration/defs/accuracy/test_llm_api_pytorch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 58de46628f..a456cbc5a4 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -2269,6 +2269,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness): torch.cuda.empty_cache() @skip_pre_blackwell + @pytest.mark.skip_less_device_memory(95000) @pytest.mark.parametrize( "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend", [ From 810249c304979f6dd8d7970c79a1bf353598ba86 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Tue, 6 Jan 2026 13:09:25 +0800 Subject: [PATCH 03/15] [https://nvbugs/5769926] [fix] Add no container mount home WAR (#10431) Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- examples/disaggregated/slurm/benchmark/submit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py index e77671caa2..a12e675134 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -287,7 +287,7 @@ def submit_job(config, log_dir, dry_run): f"--container-image {env_config['container_image']}", f"--container-name {container_name}", f"--container-mounts {env_config['container_mount']}", - "--mpi=pmix --overlap", + "--no-container-mount-home --mpi=pmix --overlap", f"bash {os.path.join(env_config['work_dir'], 'start_worker.sh')}", server_type, str(server_id), @@ -313,7 +313,7 @@ def submit_job(config, log_dir, dry_run): f"--container-name={container_name}", 
f"--container-image={env_config['container_image']}", f"--container-mounts={env_config['container_mount']}", - f"--mpi=pmix --overlap -N 1 -n 1", + f"--no-container-mount-home --mpi=pmix --overlap -N 1 -n 1", f"bash {env_config['work_dir']}/start_server.sh {os.path.join(log_dir, 'server_config.yaml')} \"{server_env_var}\"", f"&> {log_dir}/4_output_server.log &", ] From 998527724ca52175532ddfc01e24967c7ea797eb Mon Sep 17 00:00:00 2001 From: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> Date: Tue, 6 Jan 2026 14:09:21 +0800 Subject: [PATCH 04/15] [TRTLLM-8638][fix] Add failed cases into waives.txt (#10367) Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 9f66ce0b16..2ca4fbf444 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -507,3 +507,4 @@ cpp/test_e2e.py::test_model[-mamba-86] SKIP (https://nvbugs/5781665) unittest/llmapi/test_llm_multi_gpu_pytorch.py::test_tinyllama_logits_processor_tp2pp2 SKIP (https://nvbugs/5781731) accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2] SKIP (https://nvbugs/5756008) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance] SKIP (https://nvbugs/5756008) +disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5776445) From 5108a69fc07d8dcc9529918b3c032e87a144e60c Mon Sep 17 00:00:00 2001 From: Yiqing Yan Date: Tue, 6 Jan 2026 14:39:55 +0800 Subject: [PATCH 05/15] [TRTLLM-9622][infra] Enable DGX_B300 multi-gpu testing in pre-merge pipeline (#9699) Signed-off-by: Yiqing Yan --- jenkins/L0_Test.groovy | 3 +- .../test_lists/test-db/l0_dgx_b300.yml | 29 ++++++++++++++----- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index be84a32cfd..f3a8226167 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -3256,12 +3256,13 @@ def launchTestJobs(pipeline, testFilter) "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4], "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4], "DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4], - "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1], "DGX_B200-4_GPUs-PyTorch-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true], "DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true], "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8-lbd", "l0_dgx_b200", 1, 1, 8, 1, true], "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 2, 4, 1, true], "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true], + "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1], + "DGX_B300-4_GPUs-PyTorch-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4], "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4], "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4], // PerfSanity post-merge tests diff --git a/tests/integration/test_lists/test-db/l0_dgx_b300.yml b/tests/integration/test_lists/test-db/l0_dgx_b300.yml index 749f032fed..c09e3a0415 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b300.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b300.yml @@ -31,11 +31,9 @@ l0_dgx_b300: - 
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] @@ -44,15 +42,12 @@ l0_dgx_b300: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False] @@ -61,11 +56,9 @@ l0_dgx_b300: - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-fp8] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8] - - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] @@ -75,3 +68,25 @@ l0_dgx_b300: - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] TIMEOUT (180) # ------------- AutoDeploy tests --------------- +- condition: + ranges: + system_gpu_count: + gte: 4 + lte: 4 + wildcards: + gpu: + - '*gb110*' + - '*b300*' + linux_distribution_name: ubuntu* + cpu: x86_64 + terms: + stage: pre_merge + backend: pytorch + tests: + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] + - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=False] + - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] + - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] + - 
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] From 1e828587e5aeb995cd9b435fd22906da11221a4d Mon Sep 17 00:00:00 2001 From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Date: Tue, 6 Jan 2026 15:02:29 +0800 Subject: [PATCH 06/15] [TRTLLM-9896][test] add vswa test cases coverage (#10146) Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> --- .../accuracy/references/json_mode_eval.yaml | 11 ++ .../defs/accuracy/test_llm_api_pytorch.py | 139 ++++++++++++++++++ .../test_lists/qa/llm_function_core.txt | 6 + .../test_lists/qa/llm_function_rtx6k.txt | 15 ++ tests/integration/test_lists/waives.txt | 2 + 5 files changed, 173 insertions(+) diff --git a/tests/integration/defs/accuracy/references/json_mode_eval.yaml b/tests/integration/defs/accuracy/references/json_mode_eval.yaml index 0a1330cd9d..0d36ea6d26 100644 --- a/tests/integration/defs/accuracy/references/json_mode_eval.yaml +++ b/tests/integration/defs/accuracy/references/json_mode_eval.yaml @@ -8,3 +8,14 @@ deepseek-ai/DeepSeek-V3-Lite: - accuracy: 77.00 - spec_dec_algo: MTP accuracy: 77.00 +google/gemma-3-1b-it: + - quant_algo: FP8 + kv_cache_quant_algo: FP8 + accuracy: 61.00 +GPT-OSS/120B-MXFP4: + - quant_algo: W4A16_MXFP4 + spec_dec_algo: Eagle + accuracy: 62.00 + - quant_algo: W4A8_MXFP4_MXFP8 + spec_dec_algo: Eagle + accuracy: 62.00 diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index a456cbc5a4..ee416eb247 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -1105,6 +1105,37 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness): task = MMLU(self.MODEL_NAME) task.evaluate(llm) + def test_fp8_vswa_reuse(self): + # NOTE: Test with VSWA kv cache config. + kv_cache_config = KvCacheConfig( + enable_block_reuse=True, + max_attention_window=[512, 512, 512, 512, 512, 32768], + ) + prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-1b-it-fp8/" + with LLM(prequantized_model_path, + kv_cache_config=kv_cache_config) as llm: + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + + @pytest.mark.parametrize("backend", ["xgrammar"]) + def test_fp8_guided_decoding_vswa_reuse(self, backend: str, mocker): + mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"}) + prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-1b-it-fp8/" + kv_cache_config = KvCacheConfig( + enable_block_reuse=True, + max_attention_window=[512, 512, 512, 512, 512, 32768], + ) + cuda_graph_config = CudaGraphConfig(enable_padding=True) + llm = LLM(prequantized_model_path, + guided_decoding_backend=backend, + kv_cache_config=kv_cache_config, + cuda_graph_config=cuda_graph_config) + with llm: + task = JsonModeEval(self.MODEL_NAME) + task.evaluate(llm) + def test_auto_dtype_vswa_without_reuse(self): # NOTE: Test with VSWA kv cache config. 
kv_cache_config = KvCacheConfig( @@ -4461,6 +4492,114 @@ class TestGPTOSS(LlmapiAccuracyTestHarness): sampling_params=sampling_params, extra_evaluator_kwargs=extra_evaluator_kwargs) + @pytest.mark.skip_less_device(4) + @pytest.mark.parametrize("one_model", [True, False], + ids=["one_model", "two_model"]) + def test_eagle3_vswa_reuse_4gpus(self, one_model, mocker): + MAX_OUTPUT_LEN = 128179 + MAX_INPUT_LEN = 32768 + + mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) + mocker.patch.dict(GSM8K.EVALUATE_KWARGS, + {"scores_filter": "exact_match,flexible-extract"}) + + mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN) + mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN) + + pytorch_config = dict(cuda_graph_config=CudaGraphConfig()) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4, + dtype="auto", + enable_block_reuse=True, + max_attention_window=[128, 32768]) + + eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3" + draft_len = 3 + spec_config = EagleDecodingConfig(max_draft_len=draft_len, + speculative_model_dir=eagle_model_dir, + eagle3_one_model=one_model, + allow_advanced_sampling=True) + + max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN + llm = LLM(self.MODEL_PATH, + tensor_parallel_size=4, + pipeline_parallel_size=1, + moe_expert_parallel_size=1, + kv_cache_config=kv_cache_config, + max_seq_len=max_seq_len, + speculative_config=spec_config, + **pytorch_config, + enable_attention_dp=False) + + with llm: + model_name = "GPT-OSS/120B-MXFP4" + + # GSM8K + task = GSM8K(model_name) + task.evaluate(llm, + extra_evaluator_kwargs=self.extra_evaluator_kwargs) + + # GPQA Medium Reasoning + task = GPQADiamond(model_name) + + chat_template_kwargs = dict(reasoning_effort="medium") + extra_evaluator_kwargs = { + **self.extra_evaluator_kwargs, "chat_template_kwargs": + chat_template_kwargs + } + + sampling_params = SamplingParams( + temperature=1.0, + top_p=1.0, + max_tokens=MAX_OUTPUT_LEN, + truncate_prompt_tokens=MAX_INPUT_LEN) + + task.evaluate(llm, + sampling_params=sampling_params, + extra_evaluator_kwargs=extra_evaluator_kwargs) + + @pytest.mark.skip_less_device(4) + @pytest.mark.parametrize("one_model", [True, False], + ids=["one_model", "two_model"]) + def test_eagle3_guided_decoding_4gpus(self, one_model, mocker): + MAX_OUTPUT_LEN = 128179 + MAX_INPUT_LEN = 32768 + + mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"}) + mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) + mocker.patch.dict(GSM8K.EVALUATE_KWARGS, + {"scores_filter": "exact_match,flexible-extract"}) + + mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN) + mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN) + + pytorch_config = dict(cuda_graph_config=CudaGraphConfig()) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4, + dtype="auto") + + eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3" + draft_len = 3 + spec_config = EagleDecodingConfig(max_draft_len=draft_len, + speculative_model_dir=eagle_model_dir, + eagle3_one_model=one_model, + allow_advanced_sampling=True) + + max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN + llm = LLM(self.MODEL_PATH, + tensor_parallel_size=4, + pipeline_parallel_size=1, + moe_expert_parallel_size=1, + guided_decoding_backend="xgrammar", + kv_cache_config=kv_cache_config, + max_seq_len=max_seq_len, + speculative_config=spec_config, + **pytorch_config, + enable_attention_dp=False) + + with llm: + model_name = "GPT-OSS/120B-MXFP4" + task = JsonModeEval(model_name) + 
task.evaluate(llm) + @pytest.mark.skip_less_device(2) @pytest.mark.timeout(14400) @pytest.mark.parametrize("overlap_scheduler", [True, False], diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index c0ad85da97..541783d36b 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -430,6 +430,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True] accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_vswa_reuse +accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse[xgrammar] accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype @@ -613,6 +615,10 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram diff --git a/tests/integration/test_lists/qa/llm_function_rtx6k.txt b/tests/integration/test_lists/qa/llm_function_rtx6k.txt index 36992b9937..51cff78ff3 100644 --- a/tests/integration/test_lists/qa/llm_function_rtx6k.txt +++ b/tests/integration/test_lists/qa/llm_function_rtx6k.txt @@ -151,6 +151,9 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_sm120[throughput_tp8] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] @@ -204,6 +207,18 @@ accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype 
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-one_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model] +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model] test_e2e.py::test_ptp_quickstart_advanced_mixed_precision test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 2ca4fbf444..be437eb6df 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -508,3 +508,5 @@ unittest/llmapi/test_llm_multi_gpu_pytorch.py::test_tinyllama_logits_processor_t accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2] SKIP (https://nvbugs/5756008) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance] SKIP (https://nvbugs/5756008) disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5776445) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model] SKIP (https://nvbugs/5756028) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model] SKIP (https://nvbugs/5756028) From 2eaabd7461b518bac8a8ec69e981058a2cbc4f04 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Tue, 6 Jan 2026 15:42:37 +0800 Subject: [PATCH 07/15] [None] [fix] Fix undefined tokens_per_block (#10438) Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- tensorrt_llm/_torch/attention_backend/sparse/dsa.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/_torch/attention_backend/sparse/dsa.py b/tensorrt_llm/_torch/attention_backend/sparse/dsa.py index 80e37bba7d..aa32d6317e 100644 --- a/tensorrt_llm/_torch/attention_backend/sparse/dsa.py +++ b/tensorrt_llm/_torch/attention_backend/sparse/dsa.py @@ -813,13 +813,14 @@ class DSAtrtllmAttentionMetadata(TrtllmAttentionMetadata): # Expand schedule metadata buffer (only generation) kv_lens_expanded = self.kv_lens_expanded_cuda[:num_tokens] scheduler_metadata_buffer_expanded = get_paged_mqa_logits_metadata( - kv_lens_expanded, tokens_per_block, self.num_sms) + kv_lens_expanded, 
self.kv_cache_manager.tokens_per_block, + self.num_sms) self.scheduler_metadata_buffer_expanded.copy_( scheduler_metadata_buffer_expanded, non_blocking=True) elif self.max_draft_tokens == 3: scheduler_metadata_buffer_mtp3 = get_paged_mqa_logits_metadata( self.kv_lens_cuda[self.num_contexts:self.num_seqs], - tokens_per_block, self.num_sms // 2) + self.kv_cache_manager.tokens_per_block, self.num_sms // 2) self.scheduler_metadata_buffer_mtp3.copy_( scheduler_metadata_buffer_mtp3, non_blocking=True) self.prepare_dense_topk_indices(self.kv_lens_cuda, device=True) From ab58d7cac18647dc4440efec552c4901942344d0 Mon Sep 17 00:00:00 2001 From: William Zhang <133824995+2ez4bz@users.noreply.github.com> Date: Mon, 5 Jan 2026 23:49:54 -0800 Subject: [PATCH 08/15] [https://nvbugs/5772361][ci] Unwaive tests that have been fixed (#10424) These tests were all failing due to the same issue, and were fixed in #10394. Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index be437eb6df..6e6820e71c 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -472,15 +472,12 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestMistralLarge3_675B::test_nvfp4_ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8] SKIP (https://nvbugs/5772396) full:sm100/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-auto] SKIP (https://nvbugs/5772396) accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm] SKIP (https://nvbugs/5772360) -accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_fp8 SKIP (https://nvbugs/5772361) accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model] SKIP (https://nvbugs/5772993) -test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image] SKIP (https://nvbugs/5772363) accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph SKIP (https://nvbugs/5772995) test_e2e.py::test_eagle3_output_consistency_4gpus[Qwen3/Qwen3-30B-A3B-Qwen3/Qwen3-30B-eagle3] SKIP (https://nvbugs/5685010) full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2] SKIP (https://nvbugs/5773047) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[enable_configurable_moe-dp4-trtllm-fp8] SKIP (https://nvbugs/5773201) unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_tp_sharding.py::test_sharding[GQA_Block-torch_dist_all_reduce-True-False-2] SKIP (https://nvbugs/5766982) -test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image] SKIP (https://nvbugs/5773195) accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=True-chunked_context=True] SKIP (https://nvbugs/5773185) accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=True-chunked_context=False] SKIP (https://nvbugs/5773185) accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP (https://nvbugs/5596343) From df0b976b9911ed043fcd357952f5e6dbfe5abb58 Mon Sep 17 00:00:00 2001 From: Bo Li <22713281+bobboli@users.noreply.github.com> Date: Tue, 6 Jan 2026 16:32:19 +0800 Subject: [PATCH 09/15] [https://nvbugs/5785206][infra] Waive 
TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]. (#10441) Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 6e6820e71c..49e547bcf8 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -507,3 +507,4 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_ disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5776445) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model] SKIP (https://nvbugs/5756028) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model] SKIP (https://nvbugs/5756028) +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False] SKIP (https://nvbugs/5785206) From 6507087c3febf8e17a3973106b22143ada1be6ea Mon Sep 17 00:00:00 2001 From: Emma Qiao Date: Tue, 6 Jan 2026 16:54:54 +0800 Subject: [PATCH 10/15] [None][infra] Waive failed cases on 1/6 (#10440) Signed-off-by: qqiao --- tests/integration/test_lists/waives.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 49e547bcf8..44fb7dbd92 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -504,6 +504,14 @@ cpp/test_e2e.py::test_model[-mamba-86] SKIP (https://nvbugs/5781665) unittest/llmapi/test_llm_multi_gpu_pytorch.py::test_tinyllama_logits_processor_tp2pp2 SKIP (https://nvbugs/5781731) accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2] SKIP (https://nvbugs/5756008) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance] SKIP (https://nvbugs/5756008) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5784526) +unittest/_torch/modules/test_fused_moe.py::test_fused_moe_multi_gpu[1-CUTLASS] SKIP (https://nvbugs/5784543) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/5707359) +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5673559) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5701445) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075) +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0] SKIP (https://nvbugs/5748600) +unittest/_torch/ray_orchestrator/multi_gpu/test_multi_instance.py::test_multi_instance[tp2_2instances] SKIP (https://nvbugs/5784566) disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5776445) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model] SKIP (https://nvbugs/5756028) 
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model] SKIP (https://nvbugs/5756028) From 704f58dfbee141bb1bf28d84d69620dfeeb00068 Mon Sep 17 00:00:00 2001 From: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> Date: Tue, 6 Jan 2026 17:47:54 +0800 Subject: [PATCH 11/15] [TRTLLM-8638][fix] Add failed cases into waives.txt (#10427) Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 44fb7dbd92..6c0b5aedd2 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -516,3 +516,4 @@ disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP ( accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model] SKIP (https://nvbugs/5756028) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model] SKIP (https://nvbugs/5756028) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False] SKIP (https://nvbugs/5785206) +examples/test_gpt.py::test_llm_gpt2_parallel_embedding_2gpu[float16-0] SKIP (https://nvbugs/5784518) From 7d62773c6c88244b574f5d95d8ec3acf0c4bffee Mon Sep 17 00:00:00 2001 From: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com> Date: Tue, 6 Jan 2026 23:25:46 +0800 Subject: [PATCH 12/15] [https://nvbugs/5760726][fix] Use random port in container port section (#10432) Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com> --- tests/integration/defs/common.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/integration/defs/common.py b/tests/integration/defs/common.py index 6d82069e99..4c42032f54 100644 --- a/tests/integration/defs/common.py +++ b/tests/integration/defs/common.py @@ -15,6 +15,7 @@ import copy import os import platform +import random import re import socket import tempfile @@ -1162,28 +1163,33 @@ def get_free_port_in_ci(max_attempts=100): Get a free port in the range [CONTAINER_PORT_START, CONTAINER_PORT_START + CONTAINER_PORT_NUM - 1] If CONTAINER_PORT_START and CONTAINER_PORT_NUM are not set or all ports are already in use, fallback to get_free_port """ + global PORTS_IN_USE + container_port_start = int(os.environ.get("CONTAINER_PORT_START", -1)) container_port_num = int(os.environ.get("CONTAINER_PORT_NUM", -1)) if container_port_start != -1 and container_port_num != -1: - for i in range(container_port_num): - port = container_port_start + i - if port in PORTS_IN_USE: - continue + available_ports = [ + port for port in range(container_port_start, container_port_start + + container_port_num) + if port not in PORTS_IN_USE + ] + + for _ in range(len(available_ports)): + # Get a random port from the available ports + port = random.choice(available_ports) # Check if the port is free with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: try: s.bind(("localhost", port)) - - # Port is free, add it to the set of used ports PORTS_IN_USE.add(port) return port except OSError: - # Port is not free, try the next port + available_ports.remove(port) continue # No port found in the range, try to get a random free port from the system - for i in range(max_attempts): + for _ in range(max_attempts): port = get_free_port() if port not in PORTS_IN_USE: PORTS_IN_USE.add(port) From 6a4bebcd0169605821ecfc4c176efcc7147beb5d Mon Sep 17 00:00:00 
2001 From: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com> Date: Tue, 6 Jan 2026 23:39:15 +0800 Subject: [PATCH 13/15] [None][chore] remove redundant retries while binding to arbitrary port (#10452) Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com> --- tensorrt_llm/commands/serve.py | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py index ff189e3be9..9eb271551a 100644 --- a/tensorrt_llm/commands/serve.py +++ b/tensorrt_llm/commands/serve.py @@ -18,7 +18,7 @@ from torch.cuda import device_count from tensorrt_llm import LLM as PyTorchLLM from tensorrt_llm import MultimodalEncoder from tensorrt_llm._tensorrt_engine import LLM -from tensorrt_llm._utils import get_free_port, mpi_rank +from tensorrt_llm._utils import mpi_rank from tensorrt_llm.executor.utils import LlmLauncherEnvs from tensorrt_llm.inputs.multimodal import MultimodalServerConfig from tensorrt_llm.llmapi import (BuildConfig, CapacitySchedulerPolicy, @@ -189,25 +189,12 @@ def launch_server( with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: # If disagg cluster config is provided and port is not specified, try to find a free port, otherwise try to bind to the specified port assert port > 0 or disagg_cluster_config is not None, "Port must be specified if disagg cluster config is not provided" - if port > 0: - port_retries = 1 - else: - port_retries = 100 - port = get_free_port() - while port_retries > 0: - try: - s.bind((host, port)) - break - except OSError as e: - port_retries -= 1 - if port_retries == 0: - raise RuntimeError( - f"Failed to bind socket to {host}:{port}: {e}") - else: - logger.warning( - f"Failed to bind socket to {host}:{port}: {e}, retrying {port_retries}..." 
- ) - port = get_free_port() + try: + s.bind((host, port)) + if port == 0: + port = s.getsockname()[1] + except OSError as e: + raise RuntimeError(f"Failed to bind socket to {host}:{port}: {e}") if backend == 'pytorch': llm_args.pop("build_config", None) From 037753f65b17660d033877af3b390a81696600ba Mon Sep 17 00:00:00 2001 From: Enwei Zhu <21126786+syuoni@users.noreply.github.com> Date: Wed, 7 Jan 2026 00:38:12 +0800 Subject: [PATCH 14/15] [https://nvbugs/5748600][ci] Unwaive disagg guided decoding test (#10409) Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 6c0b5aedd2..fb39214c8f 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -421,7 +421,6 @@ accuracy/test_cli_flow.py::TestPhi3_5MiniInstruct::test_auto_dtype SKIP (https:/ unittest/_torch/auto_deploy/unit/singlegpu/models/test_llama4_vlm_patch.py::test_build_run_llama4_vlm SKIP (https://nvbugs/5747878) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=TRTLLM] SKIP (https://nvbugs/5740377) cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0] SKIP (https://nvbugs/5748600) examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979) examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5608979) examples/test_bert.py::test_llm_bert_general[compare_hf-disable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979) From 77be1b75720ac975fc7be0c8531c63422f6a2067 Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Tue, 6 Jan 2026 11:46:34 -0500 Subject: [PATCH 15/15] [https://nvbugs/5749988][fix] Remove redundant qwen3 spec dec test (#10387) Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 - .../_torch/speculative/test_eagle3.py | 112 ------------------ 2 files changed, 113 deletions(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index fb39214c8f..e50b692a3d 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -346,7 +346,6 @@ accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (htt unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py::test_build_ad[meta-llama/Meta-Llama-3.1-8B-Instruct-llm_extra_args0-2] SKIP (https://nvbugs/5680755) full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache SKIP (https://nvbugs/5682551) unittest/_torch/speculative/test_draft_len_schedule.py::test_correctness_across_batch_sizes[model_drafter-schedule1] SKIP (https://nvbugs/5680911) -unittest/_torch/speculative/test_eagle3.py::test_qwen3_eagle3[True-True-True-True] SKIP 
(https://nvbugspro.nvidia.com/bug/5749988) accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype SKIP (https://nvbugs/5612438) accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True] SKIP (https://nvbugs/5688721) accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2] SKIP (https://nvbugs/5769712) diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py index e504c14e23..a459ae718f 100644 --- a/tests/unittest/_torch/speculative/test_eagle3.py +++ b/tests/unittest/_torch/speculative/test_eagle3.py @@ -780,117 +780,5 @@ def test_eagle3_cdl_sampling(disable_overlap_scheduler: bool): llm_spec.shutdown() -@pytest.mark.parametrize( - "enable_block_reuse,use_one_model,enable_chunked_prefill,fp8_target", [ - [True, True, True, True], - ]) -@pytest.mark.high_cuda_memory -def test_qwen3_eagle3(enable_block_reuse: bool, use_one_model: bool, - enable_chunked_prefill: bool, fp8_target: bool): - # Eagle3 one model works with overlap scheduler and block reuse. - total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9 - if total_mem_gb < 35: - pytest.skip("Not enough memory to load target + draft model") - - use_cuda_graph = True - attn_backend = "TRTLLM" - disable_overlap_scheduler = False - use_chain_drafter = True - multi_batch = False - attention_dp = False - - models_path = llm_models_root() - eagle_model_dir = f"{models_path}/Zhi-Create-Qwen3-32B-Eagle3" - target_model_dir = f"{models_path}/Qwen3/Qwen3-32B" - if fp8_target: - target_model_dir = f"{models_path}/Qwen3/Qwen3-32B-FP8/" - - # bs > 1 gives non-deterministic when doing IFB. There are slight chances - # that ref and spec does not match 100% - max_batch_size = 4 if multi_batch else 1 - max_draft_len = 3 - kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse, - max_tokens=8192) - if fp8_target: - kv_cache_config.dtype = 'fp8' - cuda_graph_config = CudaGraphConfig( - batch_sizes=[i for i in range(1, max_batch_size + - 1)]) if use_cuda_graph else None - - llm_common_config = dict( - model=target_model_dir, - attn_backend=attn_backend, - disable_overlap_scheduler=disable_overlap_scheduler, - cuda_graph_config=cuda_graph_config, - max_batch_size=max_batch_size, - kv_cache_config=kv_cache_config, - enable_attention_dp=attention_dp, - max_seq_len=8192, - enable_chunked_prefill=enable_chunked_prefill, - ) - if enable_chunked_prefill: - # Use a small max_num_tokens so that the chunked prefill path gets exercised. - llm_common_config['max_num_tokens'] = 64 - - spec_config = EagleDecodingConfig( - max_draft_len=max_draft_len, - speculative_model_dir=eagle_model_dir, - eagle3_one_model=use_one_model, - ) - spec_config._allow_chain_drafter = use_chain_drafter - - # Create the LLM instance - llm_spec = LLM(**llm_common_config, speculative_config=spec_config) - - # Acceptance rate tests - if enable_chunked_prefill: - # Use a long prompt for chunked prefill tests. - prompts = [ - "The capital of France is a city of romance, art, fashion, and cuisine. Paris is a must-visit destination for anyone who loves history, architecture, and culture. From the iconic Eiffel Tower to the world-famous Louvre Museum, Paris has something to offer for every interest and age.\nThe city is divided into 20 arrondissements, each with its own unique character and charm. The Latin Quarter is a popular area for students and young travelers, while the Champs-Élysées is a hub for shopping and dining. 
The Montmartre neighborhood is famous for its bohemian vibe and stunning views of the city.\nParis is also known for its beautiful parks and gardens, such as the Luxembourg Gardens and the Tuileries Garden. The city has a rich history, with landmarks like the Notre-Dame Cathedral and the Arc de Triomphe. Visitors can also explore the city's many museums, including the Musée d'Orsay and the Musée Rodin.\nIn addition to its cultural and historical attractions, Paris is also a great destination for foodies. The city is famous for its cuisine, including croissants, baguettes, and cheese. Visitors can sample the city's famous dishes at one of the many restaurants, cafes, and " - ] - tok_ids = [llm_spec.tokenizer.encode(prompts[0])] - else: - prompts = [ - "The capital of France is", - "The president of the United States is", - ] - tok_ids = [llm_spec.tokenizer.encode("The future of AI is")] - if multi_batch: - tok_ids.append(llm_spec.tokenizer.encode(prompts)) - - sampling_params = SamplingParams(max_tokens=128, temperature=0) - for i in range(len(tok_ids)): - num_tokens = 0 - num_drafted = 0 - num_accepted = 0 - - for output in llm_spec.generate_async(tok_ids[i], - sampling_params, - streaming=True): - new_tokens = output.outputs[0].token_ids - num_drafted += max_draft_len - num_accepted += len(new_tokens) - num_tokens - 1 - num_tokens = len(new_tokens) - - accept_rate = num_accepted / num_drafted - assert accept_rate > 0.10 - - # Output tests - sampling_params = SamplingParams(max_tokens=10, temperature=0) - - results_spec = llm_spec.generate(prompts, sampling_params) - generated_text_spec = [result.outputs[0].text for result in results_spec] - llm_spec.shutdown() - - llm_ref = LLM(**llm_common_config) - results_ref = llm_ref.generate(prompts, sampling_params) - generated_text_ref = [result.outputs[0].text for result in results_ref] - llm_ref.shutdown() - - for text_spec, text_ref in zip(generated_text_spec, generated_text_ref): - # The spec decode algorithm currently guarantees identical results - assert text_spec == text_ref - - if __name__ == "__main__": unittest.main()
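
A closing note on the socket pattern that patches 12 and 13 converge on:
binding to port 0 lets the kernel pick a free ephemeral port, and
getsockname() reports which port was chosen, which is what makes the
guess-and-retry loop deleted from serve.py redundant. Below is a minimal
standalone sketch of that pattern; the helper name is illustrative and not
part of the TensorRT-LLM codebase:

    import socket

    def reserve_ephemeral_port(host: str = "localhost") -> int:
        """Ask the OS for any free ephemeral port and return its number."""
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind((host, 0))          # port 0: kernel assigns an unused port
            return s.getsockname()[1]  # getsockname() -> (addr, port); take the port

    if __name__ == "__main__":
        print(reserve_ephemeral_port())

The port becomes free again as soon as the socket closes, so any caller that
hands the number to a separately launched server accepts a small race window;
that is why get_free_port_in_ci in patch 12 additionally records handed-out
ports in PORTS_IN_USE before returning them.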