diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index e6f7969019..df88481b32 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -15,7 +15,7 @@ l0_dgx_h100:
       auto_trigger: others
   tests:
   # ------------- PyTorch tests ---------------
-  - unittest/_torch/multi_gpu TIMEOUT (90)
+  - unittest/_torch/multi_gpu -m "not post_merge" TIMEOUT (90)
  - unittest/_torch/auto_deploy/unit/multigpu
  - unittest/llmapi/test_llm_multi_gpu_pytorch.py -m "gpu4 or gpu2"
  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=False]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200.yml b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
index baa9ddbe29..f8058cb5b6 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
@@ -89,6 +89,7 @@ l0_dgx_h200:
  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance]
  - test_e2e.py::test_trtllm_bench_llmapi_launch[pytorch_backend-llama-v3-llama3-8b]
  - test_e2e.py::test_trtllm_bench_mgmn
+ - unittest/_torch/multi_gpu -m "post_merge" TIMEOUT (90)
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/unittest/_torch/multi_gpu/test_embedding.py b/tests/unittest/_torch/multi_gpu/test_embedding.py
index 68bc744123..a147a51b49 100644
--- a/tests/unittest/_torch/multi_gpu/test_embedding.py
+++ b/tests/unittest/_torch/multi_gpu/test_embedding.py
@@ -6,7 +6,6 @@ import cloudpickle
 import pytest
 import torch
 from mpi4py import MPI
-from mpi4py.futures import MPIPoolExecutor
 from torch import nn
 
 import tensorrt_llm
@@ -21,6 +20,9 @@ MPI.pickle.__init__(
     pickle.HIGHEST_PROTOCOL,
 )
 
+# Needed since we reuse the MPI executor pool; the first test to run will leak a thread.
+pytestmark = pytest.mark.threadleak(enabled=False)
+
 
 def run_single_rank(tensor_parallel_size, single_rank_forward_func, input,
                     weights, vocab_size, hidden_size, dtype):
@@ -193,92 +195,81 @@ def row_lm_head_forward(x, vocab_size, hidden_size, dtype, tensor_parallel_size,
                     reason='needs 2 GPUs to run this test')
 @pytest.mark.parametrize("vocab_size", [128, 127],
                          ids=["balanced", "unbalanced"])
-def test_column_embedding(vocab_size):
+@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True)
+def test_column_embedding(vocab_size, mpi_pool_executor):
     torch.manual_seed(42)
     seq_len = 10
     hidden_size = 16
     dtype = torch.bfloat16
-    tensor_parallel_size = 2
+    tensor_parallel_size = mpi_pool_executor.num_workers
     input = torch.randint(0, vocab_size, (seq_len, ))
     weight = torch.randn((vocab_size, hidden_size), dtype=dtype)
-    with MPIPoolExecutor(max_workers=tensor_parallel_size) as executor:
-        results = executor.map(
-            run_single_rank,
-            *zip(*[(tensor_parallel_size, column_embedding_forward, input,
-                    weight, vocab_size, hidden_size, dtype)] * 2))
-        for r in results:
-            assert r is True
+    results = mpi_pool_executor.map(
+        run_single_rank,
+        *zip(*[(tensor_parallel_size, column_embedding_forward, input, weight,
+                vocab_size, hidden_size, dtype)] * 2))
+    for r in results:
+        assert r is True
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason='needs 2 GPUs to run this test')
 @pytest.mark.parametrize("hidden_size", [16, 15],
                          ids=["balanced", "unbalanced"])
-def test_row_embedding(hidden_size):
+@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True)
+def test_row_embedding(hidden_size, mpi_pool_executor):
     torch.manual_seed(42)
     seq_len = 2
     vocab_size = 128
     dtype = torch.bfloat16
-    tensor_parallel_size = 2
+    tensor_parallel_size = mpi_pool_executor.num_workers
     input = torch.randint(0, vocab_size, (seq_len, ))
     weight = torch.randn((vocab_size, hidden_size), dtype=dtype)
-    with MPIPoolExecutor(max_workers=tensor_parallel_size) as executor:
-        results = executor.map(
-            run_single_rank,
-            *zip(*[(tensor_parallel_size, row_embedding_forward, input, weight,
-                    vocab_size, hidden_size, dtype)] * 2))
-        for r in results:
-            assert r is True
+    results = mpi_pool_executor.map(
+        run_single_rank,
+        *zip(*[(tensor_parallel_size, row_embedding_forward, input, weight,
+                vocab_size, hidden_size, dtype)] * 2))
+    for r in results:
+        assert r is True
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason='needs 2 GPUs to run this test')
 @pytest.mark.parametrize("vocab_size", [128, 127],
                          ids=["balanced", "unbalanced"])
-def test_column_lm_head(vocab_size):
+@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True)
+def test_column_lm_head(vocab_size, mpi_pool_executor):
     torch.manual_seed(42)
     seq_len = 10
     hidden_size = 16
     dtype = torch.bfloat16
-    tensor_parallel_size = 2
+    tensor_parallel_size = mpi_pool_executor.num_workers
     input = torch.randn((seq_len, hidden_size), dtype=dtype)
     weight = torch.randn((vocab_size, hidden_size), dtype=dtype)
-    with MPIPoolExecutor(max_workers=tensor_parallel_size) as executor:
-        results = executor.map(
-            run_single_rank,
-            *zip(*[(tensor_parallel_size, column_lm_head_forward, input, weight,
-                    vocab_size, hidden_size, dtype)] * 2))
-        for r in results:
-            assert r is True
+    results = mpi_pool_executor.map(
+        run_single_rank,
+        *zip(*[(tensor_parallel_size, column_lm_head_forward, input, weight,
+                vocab_size, hidden_size, dtype)] * 2))
+    for r in results:
+        assert r is True
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason='needs 2 GPUs to run this test')
 @pytest.mark.parametrize("hidden_size", [16, 15],
                          ids=["balanced", "unbalanced"])
-def test_row_lm_head(hidden_size):
+@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True)
+def test_row_lm_head(hidden_size, mpi_pool_executor):
     torch.manual_seed(42)
     seq_len = 2
     vocab_size = 128
     dtype = torch.bfloat16
-    tensor_parallel_size = 2
+    tensor_parallel_size = mpi_pool_executor.num_workers
     input = torch.randn((seq_len, hidden_size), dtype=dtype)
     weight = torch.randn((vocab_size, hidden_size), dtype=dtype)
-    with MPIPoolExecutor(max_workers=tensor_parallel_size) as executor:
-        results = executor.map(
-            run_single_rank,
-            *zip(*[(tensor_parallel_size, row_lm_head_forward, input, weight,
-                    vocab_size, hidden_size, dtype)] * 2))
-        for r in results:
-            assert r is True
-
-
-if __name__ == '__main__':
-    test_column_embedding(128)
-    test_column_embedding(127)
-    test_row_embedding(16)
-    test_row_embedding(15)
-    test_column_lm_head(128)
-    test_column_lm_head(127)
-    test_row_lm_head(16)
-    test_row_lm_head(15)
+    results = mpi_pool_executor.map(
+        run_single_rank,
+        *zip(*[(tensor_parallel_size, row_lm_head_forward, input, weight,
+                vocab_size, hidden_size, dtype)] * 2))
+    for r in results:
+        assert r is True
diff --git a/tests/unittest/_torch/multi_gpu/test_linear.py b/tests/unittest/_torch/multi_gpu/test_linear.py
index 40cbf12058..d78dc4defa 100644
--- a/tests/unittest/_torch/multi_gpu/test_linear.py
+++ b/tests/unittest/_torch/multi_gpu/test_linear.py
@@ -6,7 +6,6 @@ import cloudpickle
 import pytest
 import torch
 from mpi4py import MPI
-from mpi4py.futures import MPIPoolExecutor
 from torch import nn
 
 import tensorrt_llm
@@ -21,6 +20,9 @@ MPI.pickle.__init__(
     pickle.HIGHEST_PROTOCOL,
 )
 
+# Needed since we reuse the MPI executor pool; the first test to run will leak a thread.
+pytestmark = pytest.mark.threadleak(enabled=False)
+
 
 def rms_norm(x: torch.Tensor, weight: torch.Tensor = None, eps: float = 1e-6):
     y = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
@@ -246,100 +248,88 @@ def row_linear_norm_fusion_forward(x, hidden_size, dtype, tensor_parallel_size,
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason='needs 2 GPUs to run this test')
-def test_mlp():
+@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True)
+def test_mlp(mpi_pool_executor):
     torch.manual_seed(42)
     seq_len = 2
     hidden_size = 16
     dtype = torch.bfloat16
-    tensor_parallel_size = 2
+    tensor_parallel_size = mpi_pool_executor.num_workers
     x = torch.randn((seq_len, hidden_size), dtype=dtype)
     l0_weight = torch.randn((4 * hidden_size, hidden_size), dtype=dtype)
     l1_weight = torch.randn((hidden_size, 4 * hidden_size), dtype=dtype)
-    with MPIPoolExecutor(max_workers=tensor_parallel_size) as executor:
-        results = executor.map(
-            run_single_rank,
-            *zip(*[(tensor_parallel_size, mlp_forward, x,
-                    [l0_weight, l1_weight], hidden_size, dtype)] * 2))
-        for r in results:
-            assert r is True
+    results = mpi_pool_executor.map(
+        run_single_rank,
+        *zip(*[(tensor_parallel_size, mlp_forward, x, [l0_weight, l1_weight],
+                hidden_size, dtype)] * 2))
+    for r in results:
+        assert r is True
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason='needs 2 GPUs to run this test')
 @pytest.mark.parametrize("hidden_size", [128, 127],
                          ids=["balanced", "unbalanced"])
-def test_column_linear(hidden_size):
+@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True)
+def test_column_linear(hidden_size, mpi_pool_executor):
     torch.manual_seed(42)
     seq_len = 10
     dtype = torch.bfloat16
-    tensor_parallel_size = 2
+    tensor_parallel_size = mpi_pool_executor.num_workers
     x = torch.randn((seq_len, hidden_size), dtype=dtype)
     l0_weight = torch.randn((hidden_size, hidden_size), dtype=dtype)
-    with MPIPoolExecutor(max_workers=tensor_parallel_size) as executor:
-        results = executor.map(
-            run_single_rank,
-            *zip(*[(tensor_parallel_size, column_linear_forward, x, [l0_weight],
-                    hidden_size, dtype)] * 2))
-        if hidden_size % 2 != 0:
-            with pytest.raises(AssertionError):
-                for r in results:
-                    assert r is True
-        else:
+    results = mpi_pool_executor.map(
+        run_single_rank,
+        *zip(*[(tensor_parallel_size, column_linear_forward, x, [l0_weight],
+                hidden_size, dtype)] * 2))
+    if hidden_size % 2 != 0:
+        with pytest.raises(AssertionError):
             for r in results:
                 assert r is True
+    else:
+        for r in results:
+            assert r is True
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason='needs 2 GPUs to run this test')
 @pytest.mark.parametrize("hidden_size", [16, 15],
                          ids=["balanced", "unbalanced"])
-def test_row_linear(hidden_size):
+@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True)
+def test_row_linear(hidden_size, mpi_pool_executor):
     torch.manual_seed(42)
     seq_len = 2
     dtype = torch.bfloat16
-    tensor_parallel_size = 2
+    tensor_parallel_size = mpi_pool_executor.num_workers
    x = torch.randn((seq_len, hidden_size), dtype=dtype)
     l0_weight = torch.randn((hidden_size, hidden_size), dtype=dtype)
-    with MPIPoolExecutor(max_workers=tensor_parallel_size) as executor:
-        results = executor.map(
-            run_single_rank,
-            *zip(*[(tensor_parallel_size, row_linear_forward, x, [l0_weight],
-                    hidden_size, dtype)] * 2))
-        if hidden_size % 2 != 0:
-            with pytest.raises(AssertionError):
-                for r in results:
-                    assert r is True
-        else:
+    results = mpi_pool_executor.map(
+        run_single_rank,
+        *zip(*[(tensor_parallel_size, row_linear_forward, x, [l0_weight],
+                hidden_size, dtype)] * 2))
+    if hidden_size % 2 != 0:
+        with pytest.raises(AssertionError):
             for r in results:
                 assert r is True
+    else:
+        for r in results:
+            assert r is True
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason='needs 2 GPUs to run this test')
 @pytest.mark.parametrize("seq_len", [2, 32], ids=lambda x: f"seqlen:{x}")
 @pytest.mark.parametrize("hidden_size", [16, 256], ids=lambda x: f"hidden:{x}")
-def test_row_linear_norm_fusion(seq_len, hidden_size):
+@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True)
+def test_row_linear_norm_fusion(seq_len, hidden_size, mpi_pool_executor):
     torch.manual_seed(42)
     dtype = torch.bfloat16
-    tensor_parallel_size = 2
+    tensor_parallel_size = mpi_pool_executor.num_workers
     x = torch.randn((seq_len, hidden_size), dtype=dtype)
     l0_weight = torch.randn((hidden_size, hidden_size), dtype=dtype)
-    with MPIPoolExecutor(max_workers=tensor_parallel_size) as executor:
-        results = executor.map(
-            run_single_rank,
-            *zip(*[(tensor_parallel_size, row_linear_norm_fusion_forward, x,
-                    [l0_weight], hidden_size, dtype)] * 2))
-        for r in results:
-            assert r is True
-
-
-if __name__ == '__main__':
-    test_column_linear(128)
-    test_column_linear(127)
-    test_row_linear(16)
-    test_row_linear(15)
-    test_mlp()
-    test_row_linear_norm_fusion(32, 256)
-    test_row_linear_norm_fusion(32, 16)
-    test_row_linear_norm_fusion(2, 16)
-    test_row_linear_norm_fusion(2, 256)
+    results = mpi_pool_executor.map(
+        run_single_rank,
+        *zip(*[(tensor_parallel_size, row_linear_norm_fusion_forward, x,
+                [l0_weight], hidden_size, dtype)] * 2))
+    for r in results:
+        assert r is True
diff --git a/tests/unittest/_torch/multi_gpu/test_star_attention.py b/tests/unittest/_torch/multi_gpu/test_star_attention.py
index 9cf12a2c28..9a106510de 100644
--- a/tests/unittest/_torch/multi_gpu/test_star_attention.py
+++ b/tests/unittest/_torch/multi_gpu/test_star_attention.py
@@ -13,6 +13,7 @@ from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig
 MAX_SEQ_LEN = 4096 + 1024
 
 
+@pytest.mark.post_merge
 @pytest.mark.parametrize("backend", ["pytorch"])
 @pytest.mark.parametrize("model_name",
                          ["llama-models-v3/Llama-3-8B-Instruct-Gradient-1048k"],
diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index 9196fac147..95b17d460d 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -1289,7 +1289,7 @@ def test_executor_lookahead_decoding_config():
     assert sampling_params.lookahead_config.max_verification_set_size == 8
 
 
-def llama_v2_13b_lora_test_harness(**llm_kwargs):
+def llama_v2_13b_lora_from_dir_test_harness(**llm_kwargs):
     # Shahar- perhaps disable build config
     hf_model_dir = get_model_path("llama-models-v2/llama-v2-13b-hf")
     hf_lora_dir = get_model_path("llama-models-v2/chinese-llama-2-lora-13b")
@@ -1321,7 +1321,7 @@ def llama_v2_13b_lora_test_harness(**llm_kwargs):
     assert similar(output.outputs[0].text, ref)
 
 
-def llama_7b_multi_lora_test_harness(**llm_kwargs):
+def llama_7b_multi_lora_from_request_test_harness(**llm_kwargs):
     hf_model_dir = get_model_path("llama-models/llama-7b-hf")
     hf_lora_dir1 = get_model_path("llama-models/luotuo-lora-7b-0.1")
     hf_lora_dir2 = get_model_path("llama-models/Japanese-Alpaca-LoRA-7b-v0")
@@ -1376,12 +1376,12 @@ def llama_7b_multi_lora_test_harness(**llm_kwargs):
 
 @skip_gpu_memory_less_than_40gb
 def test_llama_v2_13b_lora():
-    llama_v2_13b_lora_test_harness()
+    llama_v2_13b_lora_from_dir_test_harness()
 
 
 @skip_gpu_memory_less_than_40gb
 def test_llama_7b_multi_lora():
-    llama_7b_multi_lora_test_harness(max_loras=1, max_cpu_loras=8)
+    llama_7b_multi_lora_from_request_test_harness(max_loras=1, max_cpu_loras=8)
 
 
 def llama_v2_7b_prompt_adapter_test_harness(**llm_kwargs):
diff --git a/tests/unittest/llmapi/test_llm_multi_gpu.py b/tests/unittest/llmapi/test_llm_multi_gpu.py
index cd6d448342..1b54adfeaf 100644
--- a/tests/unittest/llmapi/test_llm_multi_gpu.py
+++ b/tests/unittest/llmapi/test_llm_multi_gpu.py
@@ -21,9 +21,10 @@ from .test_llm import (
     DummyError, DummyExecutorWorker3, _test_llm_capture_request_error,
     _test_llm_generate_async, check_llm_return_context_logits,
     check_llm_return_generation_logits, llm_return_logprobs_test_harness,
-    default_model_name, get_model_path, llama_7b_multi_lora_test_harness,
-    llama_model_path, llama_v2_7b_prompt_adapter_test_harness,
-    llama_v2_13b_lora_test_harness, llm_check_output,
+    default_model_name, get_model_path,
+    llama_7b_multi_lora_from_request_test_harness, llama_model_path,
+    llama_v2_7b_prompt_adapter_test_harness,
+    llama_v2_13b_lora_from_dir_test_harness, llm_check_output,
     llm_get_stats_async_test_harness, llm_get_stats_test_harness,
     llm_test_harness, mixtral_model_name, prompts, test_llm_api_eagle,
     tinyllama_logits_processor_test_harness, run_llm_with_postprocess_parallel,
@@ -253,17 +254,18 @@ def test_tinyllama_logits_processor_tp2pp2():
 @pytest.mark.gpu2
 @pytest.mark.part3
 def test_llama_v2_13b_lora_tp2():
-    llama_v2_13b_lora_test_harness(tensor_parallel_size=2,
-                                   kv_cache_config=global_kv_cache_config)
+    llama_v2_13b_lora_from_dir_test_harness(
+        tensor_parallel_size=2, kv_cache_config=global_kv_cache_config)
 
 
 @pytest.mark.gpu2
 @pytest.mark.part3
 def test_llama_7b_multi_lora_tp2():
-    llama_7b_multi_lora_test_harness(tensor_parallel_size=2,
-                                     max_loras=1,
-                                     max_cpu_loras=8,
-                                     kv_cache_config=global_kv_cache_config)
+    llama_7b_multi_lora_from_request_test_harness(
+        tensor_parallel_size=2,
+        max_loras=1,
+        max_cpu_loras=8,
+        kv_cache_config=global_kv_cache_config)
 
 
 @pytest.mark.skip(reason="https://nvbugs/5362426")
diff --git a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
index 8dc1450f33..d1034ccd76 100644
--- a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
@@ -3,8 +3,8 @@ import pytest
 # isort: off
 from .test_llm import tinyllama_logits_processor_test_harness
 from tensorrt_llm.llmapi import KvCacheConfig
-from .test_llm_pytorch import (llama_v2_13b_lora_test_harness,
-                               llama_7b_multi_lora_test_harness)
+from .test_llm_pytorch import (llama_7b_lora_from_dir_test_harness,
+                               llama_7b_multi_lora_from_request_test_harness)
 # isort: on
 
 
@@ -28,12 +28,12 @@ def test_tinyllama_logits_processor_2gpu(tp_size: int, pp_size: int):
 
 
 @pytest.mark.gpu2
-def test_llama_v2_13b_lora_tp2():
-    llama_v2_13b_lora_test_harness(tensor_parallel_size=2,
-                                   kv_cache_config=global_kv_cache_config)
+def test_llama_7b_lora_tp2():
+    llama_7b_lora_from_dir_test_harness(tensor_parallel_size=2,
+                                        kv_cache_config=global_kv_cache_config)
 
 
 @pytest.mark.gpu2
 def test_llama_7b_multi_lora_tp2():
-    llama_7b_multi_lora_test_harness(tensor_parallel_size=2,
-                                     kv_cache_config=global_kv_cache_config)
+    llama_7b_multi_lora_from_request_test_harness(
+        tensor_parallel_size=2, kv_cache_config=global_kv_cache_config)
diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py
index bb4db2c6d3..621013c954 100644
--- a/tests/unittest/llmapi/test_llm_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_pytorch.py
@@ -128,25 +128,23 @@ def test_llm_with_postprocess_parallel_and_result_handler(streaming):
                                                           tp_size=1)
 
 
-def llama_v2_13b_lora_test_harness(**llm_kwargs) -> None:
-    lora_config = LoraConfig(lora_dir=[
-        f"{llm_models_root()}/llama-models-v2/chinese-llama-2-lora-13b"
-    ],
-                             max_lora_rank=64)
-    llm = LLM(model=f"{llm_models_root()}/llama-models-v2/llama-v2-13b-hf",
+def llama_7b_lora_from_dir_test_harness(**llm_kwargs) -> None:
+    lora_config = LoraConfig(
+        lora_dir=[f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"],
+        max_lora_rank=8)
+    llm = LLM(model=f"{llm_models_root()}/llama-models/llama-7b-hf",
              lora_config=lora_config,
              **llm_kwargs)
 
     prompts = [
-        "今天天气很好,我到公园的时候,",
+        "美国的首都在哪里? \n答案:",
     ]
     references = [
-        "发现公园里到处都是人,有的在跑步,有的在打羽毛球,还有的",
+        "美国的首都是华盛顿。\n\n美国的",
     ]
-    sampling_params = SamplingParams(max_tokens=20, add_special_tokens=False)
+    sampling_params = SamplingParams(max_tokens=20)
     lora_req = LoRARequest(
-        "task-0", 0,
-        f"{llm_models_root()}/llama-models-v2/chinese-llama-2-lora-13b")
+        "task-0", 0, f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1")
     lora_request = [lora_req]
     outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)
 
@@ -154,7 +152,7 @@ def llama_v2_13b_lora_test_harness(**llm_kwargs) -> None:
     assert similar(outputs[0].outputs[0].text, references[0])
 
 
-def llama_7b_multi_lora_test_harness(**llm_kwargs) -> None:
+def llama_7b_multi_lora_from_request_test_harness(**llm_kwargs) -> None:
     hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf"
     hf_lora_dir1 = f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"
     hf_lora_dir2 = f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0"
@@ -164,6 +162,7 @@ def llama_7b_multi_lora_test_harness(**llm_kwargs) -> None:
     # (2) provide a lora_dir to infer the lora_target_modules.
     lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'],
                              max_lora_rank=8)
+
     llm = LLM(hf_model_dir, lora_config=lora_config, **llm_kwargs)
 
     prompts = [
@@ -194,8 +193,8 @@ def llama_7b_multi_lora_test_harness(**llm_kwargs) -> None:
 
 
 @skip_gpu_memory_less_than_40gb
-def test_llama_v2_13b_lora():
-    llama_v2_13b_lora_test_harness()
+def test_llama_7b_lora():
+    llama_7b_lora_from_dir_test_harness()
 
 
 @skip_gpu_memory_less_than_40gb
@@ -224,7 +223,7 @@ def test_llama_7b_lora_default_modules() -> None:
 
 @skip_gpu_memory_less_than_40gb
 def test_llama_7b_multi_lora():
-    llama_7b_multi_lora_test_harness()
+    llama_7b_multi_lora_from_request_test_harness()
 
 
 # TODO smor: currently Nemotron-Super-49B-v1 with LoRA memory consumption is overly high
diff --git a/tests/unittest/pytest.ini b/tests/unittest/pytest.ini
index 9623507c40..80a447c840 100644
--- a/tests/unittest/pytest.ini
+++ b/tests/unittest/pytest.ini
@@ -18,3 +18,4 @@ markers =
    part4
    gpu2: this test uses 2 GPUs
    gpu4: this test uses 4 GPUs
+    post_merge: this test should only run in post-merge
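For context, the multi-GPU tests above rely on a shared mpi_pool_executor fixture that is not part of this diff (it presumably lives in the suite's conftest.py). A minimal sketch of what such a fixture could look like, assuming it wraps mpi4py's MPIPoolExecutor, takes its worker count via indirect parametrization, and exposes it as num_workers:

# Hypothetical sketch only; the real fixture in the repository may differ.
import pytest
from mpi4py.futures import MPIPoolExecutor


@pytest.fixture(scope="session")
def mpi_pool_executor(request):
    # The worker count arrives via indirect parametrization, e.g.
    # @pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True).
    num_workers = getattr(request, "param", 2)
    executor = MPIPoolExecutor(max_workers=num_workers)
    # Expose the worker count so tests can derive tensor_parallel_size from it.
    executor.num_workers = num_workers
    yield executor
    executor.shutdown(wait=True)

Reusing one session-scoped pool across tests is also what motivates the threadleak(enabled=False) marker added above: the first test to touch the pool spawns a pool-management thread that outlives that test.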