Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00
[ci] parallelize torch unittests (#5714)
Signed-off-by: Omer Ullman Argov <118735753+omera-nv@users.noreply.github.com>
Parent: 3e3b1769ad
Commit: a32f7083b4
@@ -15,6 +15,7 @@ from utils.llm_data import llm_models_root
 
 @pytest.mark.parametrize("use_cuda_graph,attn_backend",
                          [[False, "TRTLLM"], [True, "TRTLLM"]])
+@pytest.mark.high_cuda_memory
 def test_llama_draft_target(use_cuda_graph: bool, attn_backend: str):
     total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
     if total_mem_gb < 60:
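The test above already guards itself with a runtime memory check; the new high_cuda_memory marker makes the same requirement visible to test selection. As a minimal sketch (not part of this commit), a conftest.py hook could act on the marker centrally so that small-GPU runners skip such tests at collection time; the 60 GB threshold below is only an illustrative value borrowed from the in-test check:

# conftest.py sketch -- illustrative only, not the repository's actual hook.
import pytest
import torch

HIGH_MEM_THRESHOLD_GB = 60  # example threshold, mirrors the in-test check above


def pytest_collection_modifyitems(config, items):
    # Skip high_cuda_memory tests when the visible GPU cannot satisfy them.
    if not torch.cuda.is_available():
        return
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    if total_gb >= HIGH_MEM_THRESHOLD_GB:
        return
    skip_high_mem = pytest.mark.skip(
        reason="insufficient GPU memory for a high_cuda_memory test")
    for item in items:
        if item.get_closest_marker("high_cuda_memory"):
            item.add_marker(skip_high_mem)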
@@ -22,6 +22,7 @@ from utils.llm_data import llm_models_root
         [False, "FLASHINFER", True, False, False],
         # [False, "TRTLLM", False, True, True], [True, "TRTLLM", False, True, True] # TODO: nvbugs/5379915
     ])
+@pytest.mark.high_cuda_memory
 def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
                       disable_overlap_scheduler: bool, enable_block_reuse: bool,
                       use_one_model: bool):
@@ -18,6 +18,7 @@ from utils.llm_data import llm_models_root
     "disable_overlap_scheduler,use_cuda_graph,attn_backend",
     [[True, False, "TRTLLM"], [True, True, "TRTLLM"],
      [True, False, "FLASHINFER"]])
+@pytest.mark.high_cuda_memory
 def test_llama_ngram(disable_overlap_scheduler: bool, use_cuda_graph: bool,
                      attn_backend: str):
     total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
@@ -42,6 +42,7 @@ def create_llm(model_dir, disable_overlap_scheduler, enable_trtllm_sampler):
 
 
 @pytest.mark.parametrize("enable_trtllm_sampler", [False, True])
+@pytest.mark.high_cuda_memory
 def test_overlap_scheduler_consistency(model_path, test_case,
                                        enable_trtllm_sampler):
     # Test configuration
@@ -41,6 +41,7 @@ def create_llm(model_dir):
     )
 
 
+@pytest.mark.high_cuda_memory
 def test_trtllm_sampler(model_path, test_case):
     prompts = [
         "Magellan and Elcano lead the first",
@@ -43,6 +43,7 @@ from tensorrt_llm.llmapi.utils import get_total_gpu_memory
         )),
     product([5376], [4], ['context'], ['float16', 'bfloat16'], [2],
             [131072], [False, True], [False, True], [False]))))
+@pytest.mark.high_cuda_memory
 def test_causal_conv1d(dim, dconv, req_type, dtype, batch_size, max_seq_len,
                        remove_padding, apply_silu, paged_cache):
     device = "cuda"
@@ -25,6 +25,7 @@ class TestFunctional(unittest.TestCase):
         product([5376], [4], ['context'], ['float16', 'bfloat16'], [2],
                 [131072], [10240], [False, True], [False, True])),
         name_func=unittest_name_func)
+    @pytest.mark.high_cuda_memory
     def test_mamba_conv1d(self, dim, dconv, req_type, dtype, batch_size,
                           max_seq_len, stride_size, remove_padding, apply_silu):
         if max_seq_len == 131072:
@@ -14,6 +14,7 @@
 # limitations under the License.
 import unittest
 
+import pytest
 import torch
 from parameterized import parameterized
 
@@ -479,6 +480,7 @@ class TestMoeAlltoAllSingleGPU(unittest.TestCase):
         # Hang with stream count > 8
         #(0, 9, 90, 8, 100),
     ])
+    @pytest.mark.no_xdist
     def test_moe_alltoall_prepare(self, ep_rank: int, ep_size: int,
                                   expert_count: int, slot_count: int,
                                   top_k: int, max_token_count_per_rank: int):
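A marker such as no_xdist only labels the test; something still has to act on it. One way to make the label self-enforcing, sketched here as an assumption rather than this commit's mechanism, is an autouse fixture that skips marked tests whenever a pytest-xdist worker is detected through the PYTEST_XDIST_WORKER environment variable that xdist sets:

# conftest.py sketch -- illustrative only; in practice CI can simply deselect
# these tests with -m "not no_xdist" in the parallel run.
import os

import pytest


@pytest.fixture(autouse=True)
def _guard_no_xdist(request):
    # Skip no_xdist-marked tests when this process is a pytest-xdist worker.
    running_under_xdist = "PYTEST_XDIST_WORKER" in os.environ
    if running_under_xdist and request.node.get_closest_marker("no_xdist"):
        pytest.skip("no_xdist test skipped under pytest-xdist; run it in a serial pass")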
@@ -19,3 +19,5 @@ markers =
     gpu2: this test uses 2 GPUs
     gpu4: this test uses 4 GPUs
     post_merge: this test should only run in post merge
+    high_cuda_memory: this test uses a lot of CUDA memory (typically more than 12GB)
+    no_xdist: this test should not run when using pytest-xdist
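Registering both markers under markers = keeps strict-marker runs warning-free and lets a test driver split the suite into a parallel pass and a serial pass, which is the point of parallelizing the torch unittests. A rough sketch of such a driver follows; the test path, worker count, and script name are assumptions for illustration, not the repository's actual CI wiring:

# run_torch_unittests.py sketch -- the real CI scripts are not shown in this diff.
import subprocess
import sys

TEST_DIR = "tests/unittest/_torch"  # assumed location of the torch unit tests


def run_pytest(extra_args):
    # Launch pytest in a fresh interpreter and propagate failures to the caller.
    subprocess.run([sys.executable, "-m", "pytest", *extra_args, TEST_DIR], check=True)


# Pass 1: everything that tolerates pytest-xdist, spread across workers.
run_pytest(["-n", "auto", "-m", "not no_xdist"])
# Pass 2: tests that must not share a session with xdist workers, run serially.
run_pytest(["-m", "no_xdist"])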