[ci] parallelize torch unittests (#5714)

Signed-off-by: Omer Ullman Argov <118735753+omera-nv@users.noreply.github.com>
Omer Ullman Argov 2025-07-09 11:05:57 +03:00 committed by GitHub
parent 3e3b1769ad
commit a32f7083b4
9 changed files with 11 additions and 0 deletions

View File

@@ -15,6 +15,7 @@ from utils.llm_data import llm_models_root
@pytest.mark.parametrize("use_cuda_graph,attn_backend",
[[False, "TRTLLM"], [True, "TRTLLM"]])
@pytest.mark.high_cuda_memory
def test_llama_draft_target(use_cuda_graph: bool, attn_backend: str):
total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
if total_mem_gb < 60:
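
The hunk above is truncated just before the skip call; for clarity, here is a minimal sketch of the full pattern the new marker annotates. The skip message and the elided test body are assumptions, not taken from this diff:

```python
import pytest
import torch


@pytest.mark.parametrize("use_cuda_graph,attn_backend",
                         [[False, "TRTLLM"], [True, "TRTLLM"]])
@pytest.mark.high_cuda_memory
def test_llama_draft_target(use_cuda_graph: bool, attn_backend: str):
    # Runtime guard kept alongside the new marker: skip on GPUs with
    # less than ~60 GB of memory (the skip message here is illustrative).
    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    if total_mem_gb < 60:
        pytest.skip("Not enough GPU memory to run this test")
    # ... remainder of the test elided ...
```

Marking the test this way lets a CI job deselect it with `-m "not high_cuda_memory"` when several xdist workers share one GPU, while the in-test guard still protects direct runs.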

View File

@@ -22,6 +22,7 @@ from utils.llm_data import llm_models_root
[False, "FLASHINFER", True, False, False],
# [False, "TRTLLM", False, True, True], [True, "TRTLLM", False, True, True] # TODO: nvbugs/5379915
])
@pytest.mark.high_cuda_memory
def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
disable_overlap_scheduler: bool, enable_block_reuse: bool,
use_one_model: bool):

View File

@@ -18,6 +18,7 @@ from utils.llm_data import llm_models_root
"disable_overlap_scheduler,use_cuda_graph,attn_backend",
[[True, False, "TRTLLM"], [True, True, "TRTLLM"],
[True, False, "FLASHINFER"]])
@pytest.mark.high_cuda_memory
def test_llama_ngram(disable_overlap_scheduler: bool, use_cuda_graph: bool,
attn_backend: str):
total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9

View File

@@ -42,6 +42,7 @@ def create_llm(model_dir, disable_overlap_scheduler, enable_trtllm_sampler):
@pytest.mark.parametrize("enable_trtllm_sampler", [False, True])
@pytest.mark.high_cuda_memory
def test_overlap_scheduler_consistency(model_path, test_case,
enable_trtllm_sampler):
# Test configuration

View File

@@ -41,6 +41,7 @@ def create_llm(model_dir):
)
@pytest.mark.high_cuda_memory
def test_trtllm_sampler(model_path, test_case):
prompts = [
"Magellan and Elcano lead the first",

View File

@@ -43,6 +43,7 @@ from tensorrt_llm.llmapi.utils import get_total_gpu_memory
)),
product([5376], [4], ['context'], ['float16', 'bfloat16'], [2],
[131072], [False, True], [False, True], [False]))))
@pytest.mark.high_cuda_memory
def test_causal_conv1d(dim, dconv, req_type, dtype, batch_size, max_seq_len,
remove_padding, apply_silu, paged_cache):
device = "cuda"

View File

@@ -25,6 +25,7 @@ class TestFunctional(unittest.TestCase):
product([5376], [4], ['context'], ['float16', 'bfloat16'], [2],
[131072], [10240], [False, True], [False, True])),
name_func=unittest_name_func)
@pytest.mark.high_cuda_memory
def test_mamba_conv1d(self, dim, dconv, req_type, dtype, batch_size,
max_seq_len, stride_size, remove_padding, apply_silu):
if max_seq_len == 131072:

View File

@@ -14,6 +14,7 @@
# limitations under the License.
import unittest
import pytest
import torch
from parameterized import parameterized
@@ -479,6 +480,7 @@ class TestMoeAlltoAllSingleGPU(unittest.TestCase):
# Hang with stream count > 8
#(0, 9, 90, 8, 100),
])
@pytest.mark.no_xdist
def test_moe_alltoall_prepare(self, ep_rank: int, ep_size: int,
expert_count: int, slot_count: int,
top_k: int, max_token_count_per_rank: int):

View File

@@ -19,3 +19,5 @@ markers =
gpu2: this test uses 2 GPUs
gpu4: this test uses 4 GPUs
post_merge: this test should only run in post merge
high_cuda_memory: this test uses a lot of CUDA memory (typically more than 12GB)
no_xdist: this test should not run when using pytest-xdist
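
As a usage note (not part of this commit): together, the two new markers allow the torch unittests to be split into an xdist-parallel shard and a serial shard. The sketch below is one way a CI driver might consume them; the test path, worker count, and marker expressions are assumptions rather than anything defined in this diff.

```python
# Hypothetical CI driver for the new markers (paths, flags, and worker count are assumptions).
import sys

import pytest

# Parallel shard: tests that are safe under pytest-xdist and fit in memory
# when several workers share a GPU.
parallel_rc = pytest.main([
    "-n", "8",
    "-m", "not no_xdist and not high_cuda_memory",
    "tests/unittest/_torch",
])

# Serial shard: tests that must avoid xdist or need a large amount of CUDA memory.
serial_rc = pytest.main([
    "-p", "no:xdist",
    "-m", "no_xdist or high_cuda_memory",
    "tests/unittest/_torch",
])

sys.exit(max(int(parallel_rc), int(serial_rc)))
```

In practice these would more likely be two separate pytest processes or CI stages, since repeated `pytest.main` calls in one interpreter can be fragile; the marker expressions are the relevant part.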