Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[ci] parallelize torch unittests (#5714)
Signed-off-by: Omer Ullman Argov <118735753+omera-nv@users.noreply.github.com>
commit a32f7083b4 (parent 3e3b1769ad)
@@ -15,6 +15,7 @@ from utils.llm_data import llm_models_root
 
 @pytest.mark.parametrize("use_cuda_graph,attn_backend",
                          [[False, "TRTLLM"], [True, "TRTLLM"]])
+@pytest.mark.high_cuda_memory
 def test_llama_draft_target(use_cuda_graph: bool, attn_backend: str):
     total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
     if total_mem_gb < 60:
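The `high_cuda_memory` marker added above only labels the test; the memory check itself still lives inside the test body (the `total_mem_gb < 60` guard). A central hook could act on the marker instead, for example on runners whose GPU is too small. The `conftest.py` sketch below is an illustration of that idea, not code from this commit, and the 12 GB threshold is simply taken from the marker description at the end of the diff:

# Hypothetical conftest.py sketch -- not part of this commit.
import pytest
import torch

MIN_GPU_MEM_GB = 12  # illustrative threshold, echoing the marker description


def pytest_collection_modifyitems(config, items):
    """Skip high_cuda_memory tests when the visible GPU is too small."""
    if not torch.cuda.is_available():
        return
    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    if total_mem_gb >= MIN_GPU_MEM_GB:
        return
    skip = pytest.mark.skip(
        reason=f"high_cuda_memory test needs >{MIN_GPU_MEM_GB}GB of CUDA memory")
    for item in items:
        if "high_cuda_memory" in item.keywords:
            item.add_marker(skip)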
@@ -22,6 +22,7 @@ from utils.llm_data import llm_models_root
         [False, "FLASHINFER", True, False, False],
         # [False, "TRTLLM", False, True, True], [True, "TRTLLM", False, True, True] # TODO: nvbugs/5379915
     ])
+@pytest.mark.high_cuda_memory
 def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
                       disable_overlap_scheduler: bool, enable_block_reuse: bool,
                       use_one_model: bool):
@@ -18,6 +18,7 @@ from utils.llm_data import llm_models_root
     "disable_overlap_scheduler,use_cuda_graph,attn_backend",
     [[True, False, "TRTLLM"], [True, True, "TRTLLM"],
      [True, False, "FLASHINFER"]])
+@pytest.mark.high_cuda_memory
 def test_llama_ngram(disable_overlap_scheduler: bool, use_cuda_graph: bool,
                      attn_backend: str):
     total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
@@ -42,6 +42,7 @@ def create_llm(model_dir, disable_overlap_scheduler, enable_trtllm_sampler):
 
 
 @pytest.mark.parametrize("enable_trtllm_sampler", [False, True])
+@pytest.mark.high_cuda_memory
 def test_overlap_scheduler_consistency(model_path, test_case,
                                        enable_trtllm_sampler):
     # Test configuration
@@ -41,6 +41,7 @@ def create_llm(model_dir):
     )
 
 
+@pytest.mark.high_cuda_memory
 def test_trtllm_sampler(model_path, test_case):
     prompts = [
         "Magellan and Elcano lead the first",
@@ -43,6 +43,7 @@ from tensorrt_llm.llmapi.utils import get_total_gpu_memory
         )),
         product([5376], [4], ['context'], ['float16', 'bfloat16'], [2],
                 [131072], [False, True], [False, True], [False]))))
+@pytest.mark.high_cuda_memory
 def test_causal_conv1d(dim, dconv, req_type, dtype, batch_size, max_seq_len,
                        remove_padding, apply_silu, paged_cache):
     device = "cuda"
@@ -25,6 +25,7 @@ class TestFunctional(unittest.TestCase):
         product([5376], [4], ['context'], ['float16', 'bfloat16'], [2],
                 [131072], [10240], [False, True], [False, True])),
                    name_func=unittest_name_func)
+    @pytest.mark.high_cuda_memory
     def test_mamba_conv1d(self, dim, dconv, req_type, dtype, batch_size,
                           max_seq_len, stride_size, remove_padding, apply_silu):
         if max_seq_len == 131072:
@@ -14,6 +14,7 @@
 # limitations under the License.
 import unittest
 
+import pytest
 import torch
 from parameterized import parameterized
 
@@ -479,6 +480,7 @@ class TestMoeAlltoAllSingleGPU(unittest.TestCase):
         # Hang with stream count > 8
         #(0, 9, 90, 8, 100),
     ])
+    @pytest.mark.no_xdist
     def test_moe_alltoall_prepare(self, ep_rank: int, ep_size: int,
                                   expert_count: int, slot_count: int,
                                   top_k: int, max_token_count_per_rank: int):
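`no_xdist` marks tests that must not run under pytest-xdist, such as the all-to-all test above, whose parameter list already notes hangs at higher stream counts. One common way to enforce such a marker is a collection hook that skips the marked tests whenever the session is an xdist worker. The sketch below is an assumed usage pattern, not code from this commit, and relies on the `PYTEST_XDIST_WORKER` environment variable that pytest-xdist sets in its worker processes:

# Hypothetical conftest.py sketch -- not part of this commit.
import os

import pytest


def pytest_collection_modifyitems(config, items):
    """Skip no_xdist tests when collected inside a pytest-xdist worker."""
    if os.environ.get("PYTEST_XDIST_WORKER") is None:
        return  # plain single-process run: keep everything
    skip = pytest.mark.skip(reason="marked no_xdist; rerun without pytest-xdist")
    for item in items:
        if "no_xdist" in item.keywords:
            item.add_marker(skip)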
@@ -19,3 +19,5 @@ markers =
     gpu2: this test uses 2 GPUs
     gpu4: this test uses 4 GPUs
     post_merge: this test should only run in post merge
+    high_cuda_memory: this test uses a lot of CUDA memory (typically more than 12GB)
+    no_xdist: this test should not run when using pytest-xdist
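With both markers registered, a CI job can run the bulk of the torch unittests in parallel while keeping the marked tests out of the xdist workers. The split below is only a sketch of how that might look, assuming pytest-xdist is installed; the actual CI wiring and worker count are not shown in this commit:

# Illustrative sketch of a split test run -- not the actual CI script.
import subprocess
import sys

# Pass 1: everything that tolerates pytest-xdist, spread over 8 workers.
parallel = subprocess.run(["pytest", "-n", "8", "-m", "not no_xdist"])

# Pass 2: the no_xdist tests, run serially in a single pytest process.
serial = subprocess.run(["pytest", "-m", "no_xdist"])

sys.exit(parallel.returncode or serial.returncode)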