mirror of https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00

[TRTLLM-9551][infra] Partition test_llm_pytorch.py for parallel execution (#10400)

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>

commit 6b71b03947 (parent ea380ff45c)
@@ -13,7 +13,10 @@ l0_a100:
   stage: pre_merge
   backend: "pytorch"
   tests:
-  - unittest/llmapi/test_llm_pytorch.py
+  - unittest/llmapi/test_llm_pytorch.py -m "part0"
+  - unittest/llmapi/test_llm_pytorch.py -m "part1"
+  - unittest/llmapi/test_llm_pytorch.py -m "part2"
+  - unittest/llmapi/test_llm_pytorch.py -m "part3"
  - unittest/llmapi/test_mpi_session.py ISOLATION
  - unittest/llmapi/test_memory_profiling.py::test_profile_kvcache # profile kvcache for vision encoder
  - unittest/llmapi/test_memory_profiling.py::test_pyexecutor_and_kvcache_share_execution_stream # test that PyExecutor and KVCacheManager share the same execution_stream
@@ -148,7 +148,10 @@ l0_h100:
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]
   - unittest/_torch/executor
   - unittest/_torch/ray_orchestrator/single_gpu
-  - unittest/llmapi/test_llm_pytorch.py
+  - unittest/llmapi/test_llm_pytorch.py -m "part0"
+  - unittest/llmapi/test_llm_pytorch.py -m "part1"
+  - unittest/llmapi/test_llm_pytorch.py -m "part2"
+  - unittest/llmapi/test_llm_pytorch.py -m "part3"
   - unittest/llmapi/test_async_llm.py -m "not (gpu2 or gpu4)"
   - examples/test_ray.py::test_llm_inference_async_ray
 - condition:
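
The new test-list entries pass the pytest option -m "part0" through -m "part3", so each CI entry collects only the tests carrying the matching marker instead of the whole file. Below is a minimal sketch, not taken from this commit, of how such custom markers are typically registered so pytest does not warn about unknown marks; the real repository may declare them in pytest.ini or a shared conftest.py instead, and only the marker names part0..part3 come from the diff.

# conftest.py -- illustrative sketch only; the actual registration point in
# TensorRT-LLM may differ. It shows the mechanism the test-list entries rely
# on: -m "partN" selects tests decorated with @pytest.mark.partN.
def pytest_configure(config):
    for i in range(4):
        config.addinivalue_line(
            "markers",
            f"part{i}: partition {i} of test_llm_pytorch.py for parallel CI execution",
        )

With the markers registered, pytest unittest/llmapi/test_llm_pytorch.py -m "part1" runs only the @pytest.mark.part1 tests, which is what each of the four new entries does in place of the single unpartitioned entry.
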
@@ -48,6 +48,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 
 @force_ampere
 @pytest.mark.parametrize("enable_chunked_prefill,", [False, True])
+@pytest.mark.part2
 def test_tinyllama_logits_processor(enable_chunked_prefill):
     tinyllama_logits_processor_test_harness(
         backend="pytorch", enable_chunked_prefill=enable_chunked_prefill)
@@ -62,6 +63,7 @@ def test_tinyllama_logits_processor(enable_chunked_prefill):
     (False, True, False, True),
     (False, True, True, True),
 ])
+@pytest.mark.part0
 def test_llm_get_stats(return_context_logits, use_overlap,
                        enable_chunked_prefill, enable_iter_req_stats):
     llm_get_stats_test_harness(tp_size=1,
@@ -82,6 +84,7 @@ def test_llm_get_stats(return_context_logits, use_overlap,
     (False, True, False, True),
     (False, True, True, True),
 ])
+@pytest.mark.part1
 def test_llm_get_stats_async(return_context_logits, use_overlap,
                              enable_chunked_prefill, enable_iter_req_stats):
     llm_get_stats_async_test_harness(
@@ -94,6 +97,7 @@ def test_llm_get_stats_async(return_context_logits, use_overlap,
         enable_iter_req_stats=enable_iter_req_stats)
 
 
+@pytest.mark.part1
 def test_llm_capture_request_error():
     _test_llm_capture_request_error(pytorch_backend=True, tp_size=1)
 
@@ -105,6 +109,7 @@ def test_llm_capture_request_error():
     [
         SamplingParams()  # pytorch only supports n=1
     ])
+@pytest.mark.part0
 def test_llm_abort_request(sampling_params):
     llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
     run_llm_abort_request(llm=llm, sampling_params=sampling_params)
@@ -118,6 +123,7 @@ def _validate_invalid_token_error_scope():
 
 
 @force_ampere
+@pytest.mark.part1
 def test_llm_invalid_input_token():
     llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
     prompts = [
@@ -136,6 +142,7 @@ def test_llm_invalid_input_token():
 
 
 @force_ampere
+@pytest.mark.part0
 def test_llm_invalid_input_token_async():
     llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
     # NB: exc_info in _validate_invalid_token_error_scope creates a reference
@@ -167,6 +174,7 @@ def test_llm_invalid_input_token_async():
         futures[collect_idx].result()
 
 
+@pytest.mark.part2
 def test_llm_reward_model():
     rm_model_path = get_model_path("Qwen2.5-Math-PRM-7B")
     tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
@@ -188,6 +196,7 @@ def test_llm_reward_model():
 
 
 @skip_ray
+@pytest.mark.part3
 def test_llm_perf_metrics():
     with LLM(model=llama_model_path,
              kv_cache_config=global_kvcache_config) as llm:
@@ -216,6 +225,7 @@ def test_llm_perf_metrics():
 
 
 @skip_ray
+@pytest.mark.part3
 def test_llm_prometheus():
     test_prompts = [
         "Hello, my name is",
@@ -239,6 +249,7 @@ def test_llm_prometheus():
 
 @skip_ray
 @pytest.mark.parametrize("streaming", [True, False])
+@pytest.mark.part3
 def test_llm_with_postprocess_parallel_and_result_handler(streaming):
     run_llm_with_postprocess_parallel_and_result_handler(streaming,
                                                          "pytorch",
@@ -306,6 +317,7 @@ def llama_7b_lora_from_dir_test_harness(**llm_kwargs) -> None:
 
 
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part0
 def test_llama_7b_lora():
     llama_7b_lora_from_dir_test_harness()
 
@@ -368,6 +380,7 @@ def _check_llama_7b_multi_lora_evict_load_new_adapters(
 
 @skip_gpu_memory_less_than_40gb
 @skip_ray  # https://nvbugs/5682551
+@pytest.mark.part3
 def test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache():
     """Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single
     llm.generate call, that's repeated twice.
@@ -381,6 +394,7 @@ def test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache():
 
 
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part1
 def test_llama_7b_multi_lora_evict_and_load_new_adapters_in_cpu_and_gpu_cache():
     """Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU
     cache size < LoRA CPU cache size.
@@ -394,6 +408,7 @@ def test_llama_7b_multi_lora_evict_and_load_new_adapters_in_cpu_and_gpu_cache():
 
 
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part0
 def test_llama_7b_multi_lora_read_from_cache_after_insert():
     """Test that loading and then using the same adapters loaded in cache works."""
     _check_llama_7b_multi_lora_evict_load_new_adapters(
@@ -405,6 +420,7 @@ def test_llama_7b_multi_lora_read_from_cache_after_insert():
 
 
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part3
 def test_llama_7b_multi_lora_evict_and_reload_evicted_adapters_in_cpu_and_gpu_cache(
 ):
     """Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU
@@ -427,6 +443,7 @@ def test_llama_7b_multi_lora_evict_and_reload_evicted_adapters_in_cpu_and_gpu_ca
 
 
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part2
 def test_llama_7b_peft_cache_config_affects_peft_cache_size():
     """Tests that LLM arg of peft_cache_config affects the peft cache sizes.
 
@@ -464,6 +481,7 @@ def test_llama_7b_peft_cache_config_affects_peft_cache_size():
 
 @skip_ray  # https://nvbugs/5682551
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part1
 def test_llama_7b_lora_config_overrides_peft_cache_config():
     """Tests that cache size args in lora_config LLM arg override the cache size
     parameters in peft_cache_config LLM arg.
@@ -487,6 +505,7 @@ def test_llama_7b_lora_config_overrides_peft_cache_config():
 # https://jirasw.nvidia.com/browse/TRTLLM-5045
 @pytest.mark.skip(reason="https://nvbugs/5448464")
 @skip_gpu_memory_less_than_138gb
+@pytest.mark.part1
 def test_nemotron_nas_lora() -> None:
     lora_config = LoraConfig(lora_dir=[
         f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_r64"
@@ -519,6 +538,7 @@ def test_nemotron_nas_lora() -> None:
 
 
 @skip_gpu_memory_less_than_80gb
+@pytest.mark.part0
 def test_llama_3_1_8b_fp8_with_bf16_lora() -> None:
     skip_fp8_pre_ada(use_fp8=True)
     model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
@@ -549,6 +569,7 @@ def test_llama_3_1_8b_fp8_with_bf16_lora() -> None:
 
 
 @skip_gpu_memory_less_than_80gb
+@pytest.mark.part2
 def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
     model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct"
 
@@ -602,6 +623,7 @@ def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
     assert len(outputs) == 2
 
 
+@pytest.mark.part2
 def test_gemma3_1b_instruct_multi_lora() -> None:
     model_dir = f"{llm_models_root()}/gemma/gemma-3-1b-it"
 
@@ -666,6 +688,7 @@ def test_gemma3_1b_instruct_multi_lora() -> None:
     (16, 16, "rank_16"),
     (4, 8, "rank_4_max_8"),
 ])
+@pytest.mark.part3
 def test_load_torch_nemo_lora_function(tmp_path, lora_rank, max_lora_rank,
                                        description):
     """Test load_torch_nemo_lora function with different LoRA rank configurations."""
@@ -695,6 +718,7 @@ def test_load_torch_nemo_lora_function(tmp_path, lora_rank, max_lora_rank,
     }, f"Expected correct module mapping for {description}"
 
 
+@pytest.mark.part0
 def test_nemo_lora_unsupported_modules_validation(tmp_path):
     """Test validation of unsupported modules in NeMo LoRA."""
     from tensorrt_llm.lora_manager import load_torch_nemo_lora
@@ -720,6 +744,7 @@ def test_nemo_lora_unsupported_modules_validation(tmp_path):
 
 
 @force_ampere
+@pytest.mark.part1
 def test_gqa_nemo_lora(tmp_path):
     """
     Test NeMo-format LoRA checkpoint loading and GQA support in TinyLlama.
@@ -798,6 +823,7 @@ def test_gqa_nemo_lora(tmp_path):
 
 class TestLlmError:
 
+    @pytest.mark.part3
     def test_max_num_token_check(self):
         """ LLM should raise error when got prompt length exceed the valid range. """
         llm = LLM(llama_model_path,
@@ -828,6 +854,7 @@ FailingExecutor = type(
 
 
 @skip_ray
+@pytest.mark.part2
 def test_llm_with_proxy_error():
     """Test that LLM properly handles GenerationExecutorWorker constructor failures.
 
@@ -928,6 +955,7 @@ def test_llm_return_logprobs_streaming(prompt_logprobs, logprobs,
 
 class TestLlmError:
 
+    @pytest.mark.part3
     def test_max_num_token_check(self):
         """ LLM should raise error when got prompt length exceed the valid range. """
         llm = LLM(llama_model_path,
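
For the four partitions to be equivalent to the old single test-list entry, every test in test_llm_pytorch.py needs to carry one of the part0..part3 markers. The following is a hypothetical local check, not part of this commit; it assumes pytest is installed and that the file path matches the test-list entries above.

# verify_partitions.py -- hypothetical helper, not part of this commit.
# Collects the tests that carry none of the part0..part3 markers; an empty
# result means the four CI partitions together cover the whole file.
import subprocess
import sys

TEST_FILE = "unittest/llmapi/test_llm_pytorch.py"  # assumed path, as used in the test lists

result = subprocess.run(
    [
        sys.executable, "-m", "pytest", TEST_FILE, "--collect-only", "-q",
        "-m", "not (part0 or part1 or part2 or part3)",
    ],
    capture_output=True,
    text=True,
)

# pytest --collect-only -q prints one node id per line, e.g. file::test_name.
unmarked = [line for line in result.stdout.splitlines() if "::" in line]
if unmarked:
    print("Tests missing a partition marker:")
    print("\n".join(unmarked))
else:
    print("All collected tests belong to one of the four partitions.")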