Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[ci] parallelize torch unittests (#5714)
Signed-off-by: Omer Ullman Argov <118735753+omera-nv@users.noreply.github.com>
commit a32f7083b4 (parent 3e3b1769ad)
@@ -15,6 +15,7 @@ from utils.llm_data import llm_models_root
 
 @pytest.mark.parametrize("use_cuda_graph,attn_backend",
                          [[False, "TRTLLM"], [True, "TRTLLM"]])
+@pytest.mark.high_cuda_memory
 def test_llama_draft_target(use_cuda_graph: bool, attn_backend: str):
     total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
     if total_mem_gb < 60:
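The `high_cuda_memory` marker added above only labels the test; the memory check itself still lives inside the test body (the `total_mem_gb < 60` guard). A central hook could act on the marker instead, for example on runners whose GPU is too small. The `conftest.py` sketch below is an illustration of that idea, not code from this commit, and the 12 GB threshold is simply taken from the marker description at the end of the diff:

# Hypothetical conftest.py sketch -- not part of this commit.
import pytest
import torch

MIN_GPU_MEM_GB = 12  # illustrative threshold, echoing the marker description


def pytest_collection_modifyitems(config, items):
    """Skip high_cuda_memory tests when the visible GPU is too small."""
    if not torch.cuda.is_available():
        return
    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    if total_mem_gb >= MIN_GPU_MEM_GB:
        return
    skip = pytest.mark.skip(
        reason=f"high_cuda_memory test needs >{MIN_GPU_MEM_GB}GB of CUDA memory")
    for item in items:
        if "high_cuda_memory" in item.keywords:
            item.add_marker(skip)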
@@ -22,6 +22,7 @@ from utils.llm_data import llm_models_root
         [False, "FLASHINFER", True, False, False],
         # [False, "TRTLLM", False, True, True], [True, "TRTLLM", False, True, True] # TODO: nvbugs/5379915
     ])
+@pytest.mark.high_cuda_memory
 def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
                       disable_overlap_scheduler: bool, enable_block_reuse: bool,
                       use_one_model: bool):
@@ -18,6 +18,7 @@ from utils.llm_data import llm_models_root
     "disable_overlap_scheduler,use_cuda_graph,attn_backend",
     [[True, False, "TRTLLM"], [True, True, "TRTLLM"],
      [True, False, "FLASHINFER"]])
+@pytest.mark.high_cuda_memory
 def test_llama_ngram(disable_overlap_scheduler: bool, use_cuda_graph: bool,
                      attn_backend: str):
     total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
@@ -42,6 +42,7 @@ def create_llm(model_dir, disable_overlap_scheduler, enable_trtllm_sampler):
 
 
 @pytest.mark.parametrize("enable_trtllm_sampler", [False, True])
+@pytest.mark.high_cuda_memory
 def test_overlap_scheduler_consistency(model_path, test_case,
                                        enable_trtllm_sampler):
     # Test configuration
@@ -41,6 +41,7 @@ def create_llm(model_dir):
     )
 
 
+@pytest.mark.high_cuda_memory
 def test_trtllm_sampler(model_path, test_case):
     prompts = [
         "Magellan and Elcano lead the first",
@@ -43,6 +43,7 @@ from tensorrt_llm.llmapi.utils import get_total_gpu_memory
         )),
         product([5376], [4], ['context'], ['float16', 'bfloat16'], [2],
                 [131072], [False, True], [False, True], [False]))))
+@pytest.mark.high_cuda_memory
 def test_causal_conv1d(dim, dconv, req_type, dtype, batch_size, max_seq_len,
                        remove_padding, apply_silu, paged_cache):
     device = "cuda"
@@ -25,6 +25,7 @@ class TestFunctional(unittest.TestCase):
         product([5376], [4], ['context'], ['float16', 'bfloat16'], [2],
                 [131072], [10240], [False, True], [False, True])),
                    name_func=unittest_name_func)
+    @pytest.mark.high_cuda_memory
     def test_mamba_conv1d(self, dim, dconv, req_type, dtype, batch_size,
                           max_seq_len, stride_size, remove_padding, apply_silu):
         if max_seq_len == 131072:
@@ -14,6 +14,7 @@
 # limitations under the License.
 import unittest
 
+import pytest
 import torch
 from parameterized import parameterized
 
@@ -479,6 +480,7 @@ class TestMoeAlltoAllSingleGPU(unittest.TestCase):
         # Hang with stream count > 8
         #(0, 9, 90, 8, 100),
     ])
+    @pytest.mark.no_xdist
     def test_moe_alltoall_prepare(self, ep_rank: int, ep_size: int,
                                   expert_count: int, slot_count: int,
                                   top_k: int, max_token_count_per_rank: int):
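`no_xdist` marks tests that must not run under pytest-xdist, such as the all-to-all test above, whose parameter list already notes hangs at higher stream counts. One common way to enforce such a marker is a collection hook that skips the marked tests whenever the session is an xdist worker. The sketch below is an assumed usage pattern, not code from this commit, and relies on the `PYTEST_XDIST_WORKER` environment variable that pytest-xdist sets in its worker processes:

# Hypothetical conftest.py sketch -- not part of this commit.
import os

import pytest


def pytest_collection_modifyitems(config, items):
    """Skip no_xdist tests when collected inside a pytest-xdist worker."""
    if os.environ.get("PYTEST_XDIST_WORKER") is None:
        return  # plain single-process run: keep everything
    skip = pytest.mark.skip(reason="marked no_xdist; rerun without pytest-xdist")
    for item in items:
        if "no_xdist" in item.keywords:
            item.add_marker(skip)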
@@ -19,3 +19,5 @@ markers =
     gpu2: this test uses 2 GPUs
     gpu4: this test uses 4 GPUs
     post_merge: this test should only run in post merge
+    high_cuda_memory: this test uses a lot of CUDA memory (typically more than 12GB)
+    no_xdist: this test should not run when using pytest-xdist
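With both markers registered, a CI job can run the bulk of the torch unittests in parallel while keeping the marked tests out of the xdist workers. The split below is only a sketch of how that might look, assuming pytest-xdist is installed; the actual CI wiring and worker count are not shown in this commit:

# Illustrative sketch of a split test run -- not the actual CI script.
import subprocess
import sys

# Pass 1: everything that tolerates pytest-xdist, spread over 8 workers.
parallel = subprocess.run(["pytest", "-n", "8", "-m", "not no_xdist"])

# Pass 2: the no_xdist tests, run serially in a single pytest process.
serial = subprocess.run(["pytest", "-m", "no_xdist"])

sys.exit(parallel.returncode or serial.returncode)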