Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)
ci: [nvbugs/5280806] Unwaive unittests/_torch. (#4951)
Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
parent 9c012d5bf8
commit 88480197da
@@ -419,7 +419,7 @@ class RopeParams:
                 self.original_max_positions,
             })
         if rope_inv_freq is not None:
-            rope_inv_freq = torch.torch.tensor(
+            rope_inv_freq = torch.tensor(
                 rope_inv_freq,
                 dtype=torch.float32,
                 device='cuda',
@@ -428,7 +428,7 @@ class RopeParams:
         rope_cos_sin = rope_cos_sin.reshape(
             self.max_positions, -1,
             2)[:, :self.dim // 2, :].transpose(0, 2, 1).reshape(1, -1)
-        rope_cos_sin = torch.torch.tensor(
+        rope_cos_sin = torch.tensor(
             rope_cos_sin,
             dtype=torch.float32,
             device='cuda',
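The only functional content of these two hunks is dropping the doubled module prefix. A minimal sketch (not part of this commit, assuming a standard PyTorch install) of why both spellings resolve to the same call, so the change is purely cosmetic:

```python
# Sketch only; assumes a standard PyTorch install. `torch.torch` is the torch
# module exposed as an attribute of itself, so the old spelling already worked;
# the hunks above merely remove the redundant prefix.
import torch

assert torch.torch is torch                 # same module object
assert torch.torch.tensor is torch.tensor   # therefore the same function

x = torch.tensor([1.0, 2.0], dtype=torch.float32)  # idiomatic form kept by the diff
print(x.dtype)  # torch.float32
```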
@@ -373,7 +373,6 @@ full:B200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2.5_7b_
 accuracy/test_cli_flow.py::TestMixtral8x22B::test_int8_plugin_tp8[renormalize-tensor_parallel] SKIP (https://nvbugs/5273695)
 examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoder] SKIP (https://nvbugs/5144931)
 examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoderplus] SKIP (https://nvbugs/5144931)
-unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)" SKIP (https://nvbugs/5280806)
 examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-disable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/5244570)
 triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble] SKIP (https://nvbugs/5240060)
 triton_server/test_triton.py::test_triton_extensive[triton-extensive] SKIP
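The waive entry removed above selects the `_torch` unit tests through a pytest `-k` keyword expression. A hypothetical way to reproduce that selection locally from Python; the path and the `pytest.main()` invocation are assumptions, not taken from the commit:

```python
# Hypothetical local reproduction of the unwaived selection; the "_torch" path
# below assumes the script is run from the repository's unit-test directory.
import pytest

if __name__ == "__main__":
    # Keep every test under _torch whose id does NOT mention modeling,
    # multi_gpu, or auto_deploy -- mirroring the -k expression in the entry.
    raise SystemExit(
        pytest.main(
            ["_torch", "-k", "not (modeling or multi_gpu or auto_deploy)"]))
```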
@@ -314,6 +314,7 @@ def test_fused_moe_nvfp4(dtype):
     torch.testing.assert_close(output, ref_output, rtol=1e-2, atol=0.1)
 
 
+@pytest.mark.skip(reason="https://nvbugs/5325653")
 @skip_neither_ada_nor_hopper_unittest
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 def test_fused_moe_w4afp8(dtype):
@@ -1,6 +1,7 @@
 import unittest
 from unittest.mock import MagicMock, patch
 
+import pytest
 import torch
 from mpi4py import MPI
 
@@ -178,6 +179,7 @@ class TestMoeLoadBalancer(unittest.TestCase):
         # Verify the global state is cleaned up
         self.assertIsNone(get_moe_load_balancer())
 
+    @pytest.mark.skip(reason="https://nvbugs/5324229")
     @patch('tensorrt_llm.bindings.internal.runtime.MoeLoadBalancer')
     def test_single_layer_moe_load_balancer_methods(self,
                                                     mock_load_balancer_impl):
@@ -13,6 +13,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
 from utils.llm_data import llm_models_root
 
 
+@pytest.mark.skip(reason="https://nvbugs/5280806")
 @pytest.mark.parametrize("use_cuda_graph,attn_backend",
                          [[True, "TRTLLM"], [False, "TRTLLM"],
                           [True, "FLASHINFER"], [False, "FLASHINFER"]])
@@ -16,6 +16,7 @@ from utils.llm_data import llm_models_root
 # TODO: Add cuda graph enabled tests.
 # Cuda graph cannot currently be enabled for ngram because cuda graph requires
 # spec metadata and ngram does not have it.
+@pytest.mark.skip(reason="https://nvbugs/5324239")
 @pytest.mark.parametrize("use_cuda_graph,attn_backend",
                          [[False, "TRTLLM"], [False, "FLASHINFER"]])
 def test_llama_ngram(use_cuda_graph: bool, attn_backend: str):
@@ -43,6 +43,9 @@ from tensorrt_llm.quantization.utils.fp4_utils import (
     ["low-latency", "throughput"],
 )
 def test_fp8_block_scale_gemm(dtype, m, k, n, inference_mode):
+    if inference_mode == "low-latency" and dtype == torch.bfloat16:
+        pytest.skip("https://nvbugs/5328141")
+
     torch.random.manual_seed(0)
     a = torch.randn((m, k), device='cuda', dtype=torch.float)
     b = torch.randn((n, k), device='cuda', dtype=torch.float)
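The commit mixes two skip styles: a `@pytest.mark.skip` decorator that disables a test outright, and a runtime `pytest.skip()` call that, as in the hunk above, waives only the offending parameter combination. A minimal sketch of both, using illustrative names and a placeholder bug URL rather than anything from the repository:

```python
# Illustrative names and a placeholder bug URL; not from the repository.
import pytest


@pytest.mark.skip(reason="https://nvbugs/0000000")
def test_always_skipped():
    # Never executed: the decorator skips every run of this test.
    assert False


@pytest.mark.parametrize("inference_mode", ["low-latency", "throughput"])
def test_conditionally_skipped(inference_mode):
    if inference_mode == "low-latency":
        # Skips only this parametrization; "throughput" still runs.
        pytest.skip("https://nvbugs/0000000")
    assert inference_mode == "throughput"
```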
@@ -1,6 +1,7 @@
 import unittest
 from dataclasses import dataclass
 
+import pytest
 import torch
 
 import tensorrt_llm
@@ -126,6 +127,7 @@ def create_model_engine_and_kvcache(config: PyTorchConfig = None):
     return model_engine, kv_cache_manager
 
 
+@pytest.mark.skip(reason="https://nvbugs/5324248")
 class PyTorchModelEngineTestCase(unittest.TestCase):
 
     def test_pad_generation_requests(self) -> None:
@@ -5,6 +5,7 @@ import sys
 import unittest
 
 import numpy as np
+import pytest
 import torch
 
 import tensorrt_llm
@@ -338,6 +339,7 @@ class TestResourceManager(unittest.TestCase):
 
         self.assertEqual(len(peft_table), self.num_lora_modules)
 
+    @pytest.mark.skip(reason="https://nvbugs/5324252")
     def test_put_get(self):
         """Test adding a request with properly configured LoRA weights and config."""
         peft_cache_config = self.create_peft_cache_config()
@@ -54,6 +54,7 @@ def test_LlmResponse_pickle():
     assert pickle_result.log_probs == logprobs
 
 
+@pytest.mark.skip(reason="https://nvbugs/5327892")
 @force_ampere  # Save H100 resource
 @pytest.mark.parametrize("return_log_probs", [False, True])
 @pytest.mark.parametrize("gather_generation_logits", [False, True])
@@ -96,31 +97,35 @@ def test_generate_with_return_logits(disable_overlap_scheduler: bool,
         top_p=1,
         return_context_logits=gather_context_logits,
         return_generation_logits=gather_generation_logits,
-        logprobs=return_log_probs)
+        logprobs=return_log_probs,
+    )
 
-    for output in llm.generate(prompts, sampling_params=sampling_params):
-        if gather_context_logits:
-            assert output.context_logits is not None
-            assert len(prompts[0].split()) == output.context_logits.shape[0]
-        else:
-            assert output.context_logits is None
+    with llm:
+        for output in llm.generate(prompts, sampling_params=sampling_params):
+            if gather_context_logits:
+                assert output.context_logits is not None
+                assert len(prompts[0].split()) == output.context_logits.shape[0]
+            else:
+                assert output.context_logits is None
 
-        if gather_generation_logits:
-            gen_logits = output.outputs[0].generation_logits
-            assert gen_logits is not None
-            assert gen_logits.ndim == 2
-            assert gen_logits.shape[0] == sampling_params.max_tokens
-            assert torch.argmax(gen_logits,
-                                dim=1).tolist() == output.outputs[0].token_ids
-        else:
-            assert output.outputs[0].generation_logits is None
+            if gather_generation_logits:
+                gen_logits = output.outputs[0].generation_logits
+                assert gen_logits is not None
+                assert gen_logits.ndim == 2
+                assert gen_logits.shape[0] == sampling_params.max_tokens
+                assert torch.argmax(
+                    gen_logits, dim=1).tolist() == output.outputs[0].token_ids
+            else:
+                assert output.outputs[0].generation_logits is None
 
-        if return_log_probs:
-            assert len(output.outputs[0].logprobs) == sampling_params.max_tokens
-        else:
-            assert len(output.outputs[0].logprobs) == 0
+            if return_log_probs:
+                assert len(
+                    output.outputs[0].logprobs) == sampling_params.max_tokens
+            else:
+                assert len(output.outputs[0].logprobs) == 0
 
 
+@pytest.mark.skip(reason="https://nvbugs/5327892")
 @force_ampere  # Save H100 resource
 @pytest.mark.parametrize("return_log_probs", [False, True])
 @pytest.mark.parametrize("gather_generation_logits", [False, True])
@@ -164,28 +169,29 @@ def test_generate_async_with_return_logits(disable_overlap_scheduler: bool,
         return_generation_logits=gather_generation_logits,
         logprobs=return_log_probs)
 
-    for idx, output in enumerate(
-            llm.generate_async(prompts[0],
-                               sampling_params=sampling_params,
-                               streaming=True)):
-        if gather_context_logits:
-            assert output.context_logits is not None
-            assert len(prompts[0].split()) == output.context_logits.shape[0]
-        else:
-            assert output.context_logits is None
+    with llm:
+        for idx, output in enumerate(
+                llm.generate_async(prompts[0],
+                                   sampling_params=sampling_params,
+                                   streaming=True)):
+            if gather_context_logits:
+                assert output.context_logits is not None
+                assert len(prompts[0].split()) == output.context_logits.shape[0]
+            else:
+                assert output.context_logits is None
 
-        if gather_generation_logits:
-            gen_logits = output.outputs[0].generation_logits
-            assert gen_logits is not None
-            assert gen_logits.ndim == 2
-            assert gen_logits.shape[0] == 1
-            assert torch.argmax(
-                gen_logits,
-                dim=1).tolist()[0] == output.outputs[0].token_ids[-1]
-        else:
-            assert output.outputs[0].generation_logits is None
+            if gather_generation_logits:
+                gen_logits = output.outputs[0].generation_logits
+                assert gen_logits is not None
+                assert gen_logits.ndim == 2
+                assert gen_logits.shape[0] == 1
+                assert torch.argmax(
+                    gen_logits,
+                    dim=1).tolist()[0] == output.outputs[0].token_ids[-1]
+            else:
+                assert output.outputs[0].generation_logits is None
 
-        if return_log_probs:
-            assert len(output.outputs[0].logprobs) == idx + 1
-        else:
-            assert len(output.outputs[0].logprobs) == 0
+            if return_log_probs:
+                assert len(output.outputs[0].logprobs) == idx + 1
+            else:
+                assert len(output.outputs[0].logprobs) == 0
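The two hunks above move the generation loops inside a `with llm:` block. A small sketch of why that matters for tests, using a stand-in class rather than the TensorRT-LLM `LLM` API: the context manager's exit hook still runs when an assertion inside the loop fails, so the engine is torn down on failure as well as on success.

```python
# Stand-in class, not the TensorRT-LLM API: it only demonstrates that __exit__
# runs even when an assert inside the loop raises, so resources are released
# on test failure as well as on success.
class FakeLLM:

    def __enter__(self):
        print("engine started")
        return self

    def __exit__(self, exc_type, exc, tb):
        print("engine shut down")  # reached on both success and failure
        return False               # do not swallow the exception

    def generate(self, prompts):
        return [p.upper() for p in prompts]


llm = FakeLLM()
try:
    with llm:
        for output in llm.generate(["hello"]):
            assert output == "WRONG"  # simulated failing assertion
except AssertionError:
    pass  # "engine shut down" was still printed before the exception escaped
```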
@@ -41,6 +41,9 @@ from utils.util import getSMVersion
     [torch.bfloat16],
 )
 def test_fp8_block_scale_gemm(dtype, m, k, n):
+    if getSMVersion() == 89 and k == 7168 and n == 2112:
+        pytest.skip("https://nvbugs/5328184")
+
     torch.random.manual_seed(0)
     a = torch.randn((m, k), device='cuda', dtype=dtype) / k
     b = torch.randn((n, k), device='cuda', dtype=dtype) / k
@@ -235,7 +235,7 @@ class TestFunctional(unittest.TestCase):
     def test_selective_scan_v2(self, dim, headdim, ngroups, dstate, req_type,
                                dtype, batch_size, max_seq_len, has_z,
                                remove_padding):
-
+        pytest.skip("https://nvbugs/5324258")
         if dtype == 'float32' and req_type == 'context':
             pytest.skip(
                 "Mamba2 chunk scan kernel only support float16 and bfloat16")