[TRTLLM-9551][infra] Partition test_llm_pytorch.py for parallel execution (#10400)

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
Yan Chunwei 2026-01-06 02:58:03 +08:00 committed by GitHub
parent ea380ff45c
commit 6b71b03947
3 changed files with 36 additions and 2 deletions
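This change splits the single test_llm_pytorch.py entry in the pre-merge test lists into four marker-based partitions (part0 through part3), so each partition can run as its own CI job. A single partition can be reproduced locally through pytest's marker selection; a minimal sketch (invocation only, using the path and marker names from the diffs below):

# Minimal sketch: run only the tests marked "part0", mirroring one of the
# four test-db entries added below. Equivalent to:
#   pytest unittest/llmapi/test_llm_pytorch.py -m "part0"
import pytest

exit_code = pytest.main(["unittest/llmapi/test_llm_pytorch.py", "-m", "part0"])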

@@ -13,7 +13,10 @@ l0_a100:
stage: pre_merge
backend: "pytorch"
tests:
- unittest/llmapi/test_llm_pytorch.py
- unittest/llmapi/test_llm_pytorch.py -m "part0"
- unittest/llmapi/test_llm_pytorch.py -m "part1"
- unittest/llmapi/test_llm_pytorch.py -m "part2"
- unittest/llmapi/test_llm_pytorch.py -m "part3"
- unittest/llmapi/test_mpi_session.py ISOLATION
- unittest/llmapi/test_memory_profiling.py::test_profile_kvcache # profile kvcache for vision encoder
- unittest/llmapi/test_memory_profiling.py::test_pyexecutor_and_kvcache_share_execution_stream # test that PyExecutor and KVCacheManager share the same execution_stream
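Each of the four partitioned entries above selects tests by a pytest marker. Those markers need to be registered with pytest so that -m selection stays warning-free; this commit only touches the test lists and the test file, so registration is assumed to live elsewhere (pytest.ini or a conftest.py). A hypothetical conftest.py sketch:

# Hypothetical conftest.py hook (not part of this commit): register the four
# partition markers so "-m partN" does not trigger PytestUnknownMarkWarning.
def pytest_configure(config):
    for i in range(4):
        config.addinivalue_line(
            "markers",
            f"part{i}: partition {i} of test_llm_pytorch.py for parallel CI execution")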

@@ -148,7 +148,10 @@ l0_h100:
- disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]
- unittest/_torch/executor
- unittest/_torch/ray_orchestrator/single_gpu
- unittest/llmapi/test_llm_pytorch.py
- unittest/llmapi/test_llm_pytorch.py -m "part0"
- unittest/llmapi/test_llm_pytorch.py -m "part1"
- unittest/llmapi/test_llm_pytorch.py -m "part2"
- unittest/llmapi/test_llm_pytorch.py -m "part3"
- unittest/llmapi/test_async_llm.py -m "not (gpu2 or gpu4)"
- examples/test_ray.py::test_llm_inference_async_ray
- condition:
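With every l0_a100 and l0_h100 entry for this file now filtering by a part marker, a test that carries none of the four markers would be skipped by all entries. A hypothetical local completeness check (not part of this commit) collects the file with the negated marker expression and expects nothing to be selected:

# Hypothetical completeness check: if every test carries a part marker,
# collecting with the negated expression should select no tests at all.
import pytest

exit_code = pytest.main([
    "unittest/llmapi/test_llm_pytorch.py",
    "--collect-only", "-q",
    "-m", "not (part0 or part1 or part2 or part3)",
])
# Exit code 5 (pytest.ExitCode.NO_TESTS_COLLECTED) is the expected result.
assert exit_code == pytest.ExitCode.NO_TESTS_COLLECTED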


@@ -48,6 +48,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
@force_ampere
@pytest.mark.parametrize("enable_chunked_prefill,", [False, True])
@pytest.mark.part2
def test_tinyllama_logits_processor(enable_chunked_prefill):
tinyllama_logits_processor_test_harness(
backend="pytorch", enable_chunked_prefill=enable_chunked_prefill)
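A function-level mark is inherited by every parametrized case, so -m "part2" selects both the chunked and non-chunked variants of the test above. A toy illustration (test name invented for this example, not repository code):

# Toy example (not repository code): the part2 mark applies to both
# parametrized cases, so `pytest -m "part2"` runs both of them.
import pytest

@pytest.mark.part2
@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
def test_toy_logits_processor(enable_chunked_prefill):
    assert enable_chunked_prefill in (False, True)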
@@ -62,6 +63,7 @@ def test_tinyllama_logits_processor(enable_chunked_prefill):
(False, True, False, True),
(False, True, True, True),
])
@pytest.mark.part0
def test_llm_get_stats(return_context_logits, use_overlap,
enable_chunked_prefill, enable_iter_req_stats):
llm_get_stats_test_harness(tp_size=1,
@@ -82,6 +84,7 @@ def test_llm_get_stats(return_context_logits, use_overlap,
(False, True, False, True),
(False, True, True, True),
])
@pytest.mark.part1
def test_llm_get_stats_async(return_context_logits, use_overlap,
enable_chunked_prefill, enable_iter_req_stats):
llm_get_stats_async_test_harness(
@@ -94,6 +97,7 @@ def test_llm_get_stats_async(return_context_logits, use_overlap,
enable_iter_req_stats=enable_iter_req_stats)
@pytest.mark.part1
def test_llm_capture_request_error():
_test_llm_capture_request_error(pytorch_backend=True, tp_size=1)
@@ -105,6 +109,7 @@ def test_llm_capture_request_error():
[
SamplingParams() # pytorch only supports n=1
])
@pytest.mark.part0
def test_llm_abort_request(sampling_params):
llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
run_llm_abort_request(llm=llm, sampling_params=sampling_params)
@@ -118,6 +123,7 @@ def _validate_invalid_token_error_scope():
@force_ampere
@pytest.mark.part1
def test_llm_invalid_input_token():
llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
prompts = [
@@ -136,6 +142,7 @@ def test_llm_invalid_input_token():
@force_ampere
@pytest.mark.part0
def test_llm_invalid_input_token_async():
llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
# NB: exc_info in _validate_invalid_token_error_scope creates a reference
@@ -167,6 +174,7 @@ def test_llm_invalid_input_token_async():
futures[collect_idx].result()
@pytest.mark.part2
def test_llm_reward_model():
rm_model_path = get_model_path("Qwen2.5-Math-PRM-7B")
tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
@@ -188,6 +196,7 @@ def test_llm_reward_model():
@skip_ray
@pytest.mark.part3
def test_llm_perf_metrics():
with LLM(model=llama_model_path,
kv_cache_config=global_kvcache_config) as llm:
@@ -216,6 +225,7 @@ def test_llm_perf_metrics():
@skip_ray
@pytest.mark.part3
def test_llm_prometheus():
test_prompts = [
"Hello, my name is",
@@ -239,6 +249,7 @@ def test_llm_prometheus():
@skip_ray
@pytest.mark.parametrize("streaming", [True, False])
@pytest.mark.part3
def test_llm_with_postprocess_parallel_and_result_handler(streaming):
run_llm_with_postprocess_parallel_and_result_handler(streaming,
"pytorch",
@@ -306,6 +317,7 @@ def llama_7b_lora_from_dir_test_harness(**llm_kwargs) -> None:
@skip_gpu_memory_less_than_40gb
@pytest.mark.part0
def test_llama_7b_lora():
llama_7b_lora_from_dir_test_harness()
@@ -368,6 +380,7 @@ def _check_llama_7b_multi_lora_evict_load_new_adapters(
@skip_gpu_memory_less_than_40gb
@skip_ray # https://nvbugs/5682551
@pytest.mark.part3
def test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache():
"""Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single
llm.generate call, that's repeated twice.
@@ -381,6 +394,7 @@ def test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache():
@skip_gpu_memory_less_than_40gb
@pytest.mark.part1
def test_llama_7b_multi_lora_evict_and_load_new_adapters_in_cpu_and_gpu_cache():
"""Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU
cache size < LoRA CPU cache size.
@@ -394,6 +408,7 @@ def test_llama_7b_multi_lora_evict_and_load_new_adapters_in_cpu_and_gpu_cache():
@skip_gpu_memory_less_than_40gb
@pytest.mark.part0
def test_llama_7b_multi_lora_read_from_cache_after_insert():
"""Test that loading and then using the same adapters loaded in cache works."""
_check_llama_7b_multi_lora_evict_load_new_adapters(
@@ -405,6 +420,7 @@ def test_llama_7b_multi_lora_read_from_cache_after_insert():
@skip_gpu_memory_less_than_40gb
@pytest.mark.part3
def test_llama_7b_multi_lora_evict_and_reload_evicted_adapters_in_cpu_and_gpu_cache(
):
"""Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU
@@ -427,6 +443,7 @@ def test_llama_7b_multi_lora_evict_and_reload_evicted_adapters_in_cpu_and_gpu_ca
@skip_gpu_memory_less_than_40gb
@pytest.mark.part2
def test_llama_7b_peft_cache_config_affects_peft_cache_size():
"""Tests that LLM arg of peft_cache_config affects the peft cache sizes.
@@ -464,6 +481,7 @@ def test_llama_7b_peft_cache_config_affects_peft_cache_size():
@skip_ray # https://nvbugs/5682551
@skip_gpu_memory_less_than_40gb
@pytest.mark.part1
def test_llama_7b_lora_config_overrides_peft_cache_config():
"""Tests that cache size args in lora_config LLM arg override the cache size
parameters in peft_cache_config LLM arg.
@@ -487,6 +505,7 @@ def test_llama_7b_lora_config_overrides_peft_cache_config():
# https://jirasw.nvidia.com/browse/TRTLLM-5045
@pytest.mark.skip(reason="https://nvbugs/5448464")
@skip_gpu_memory_less_than_138gb
@pytest.mark.part1
def test_nemotron_nas_lora() -> None:
lora_config = LoraConfig(lora_dir=[
f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_r64"
@@ -519,6 +538,7 @@ def test_nemotron_nas_lora() -> None:
@skip_gpu_memory_less_than_80gb
@pytest.mark.part0
def test_llama_3_1_8b_fp8_with_bf16_lora() -> None:
skip_fp8_pre_ada(use_fp8=True)
model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
@@ -549,6 +569,7 @@ def test_llama_3_1_8b_fp8_with_bf16_lora() -> None:
@skip_gpu_memory_less_than_80gb
@pytest.mark.part2
def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct"
@@ -602,6 +623,7 @@ def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
assert len(outputs) == 2
@pytest.mark.part2
def test_gemma3_1b_instruct_multi_lora() -> None:
model_dir = f"{llm_models_root()}/gemma/gemma-3-1b-it"
@@ -666,6 +688,7 @@ def test_gemma3_1b_instruct_multi_lora() -> None:
(16, 16, "rank_16"),
(4, 8, "rank_4_max_8"),
])
@pytest.mark.part3
def test_load_torch_nemo_lora_function(tmp_path, lora_rank, max_lora_rank,
description):
"""Test load_torch_nemo_lora function with different LoRA rank configurations."""
@@ -695,6 +718,7 @@ def test_load_torch_nemo_lora_function(tmp_path, lora_rank, max_lora_rank,
}, f"Expected correct module mapping for {description}"
@pytest.mark.part0
def test_nemo_lora_unsupported_modules_validation(tmp_path):
"""Test validation of unsupported modules in NeMo LoRA."""
from tensorrt_llm.lora_manager import load_torch_nemo_lora
@@ -720,6 +744,7 @@ def test_nemo_lora_unsupported_modules_validation(tmp_path):
@force_ampere
@pytest.mark.part1
def test_gqa_nemo_lora(tmp_path):
"""
Test NeMo-format LoRA checkpoint loading and GQA support in TinyLlama.
@@ -798,6 +823,7 @@ def test_gqa_nemo_lora(tmp_path):
class TestLlmError:
@pytest.mark.part3
def test_max_num_token_check(self):
""" LLM should raise error when got prompt length exceed the valid range. """
llm = LLM(llama_model_path,
@@ -828,6 +854,7 @@ FailingExecutor = type(
@skip_ray
@pytest.mark.part2
def test_llm_with_proxy_error():
"""Test that LLM properly handles GenerationExecutorWorker constructor failures.
@@ -928,6 +955,7 @@ def test_llm_return_logprobs_streaming(prompt_logprobs, logprobs,
class TestLlmError:
@pytest.mark.part3
def test_max_num_token_check(self):
""" LLM should raise error when got prompt length exceed the valid range. """
llm = LLM(llama_model_path,