[TRTLLM-9551][infra] Partition test_llm_pytorch.py for parallel execution (#10400)

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
Yan Chunwei 2026-01-06 02:58:03 +08:00 committed by GitHub
parent ea380ff45c
commit 6b71b03947
3 changed files with 36 additions and 2 deletions


@@ -13,7 +13,10 @@ l0_a100:
       stage: pre_merge
       backend: "pytorch"
   tests:
-  - unittest/llmapi/test_llm_pytorch.py
+  - unittest/llmapi/test_llm_pytorch.py -m "part0"
+  - unittest/llmapi/test_llm_pytorch.py -m "part1"
+  - unittest/llmapi/test_llm_pytorch.py -m "part2"
+  - unittest/llmapi/test_llm_pytorch.py -m "part3"
   - unittest/llmapi/test_mpi_session.py ISOLATION
   - unittest/llmapi/test_memory_profiling.py::test_profile_kvcache # profile kvcache for vision encoder
   - unittest/llmapi/test_memory_profiling.py::test_pyexecutor_and_kvcache_share_execution_stream # test that PyExecutor and KVCacheManager share the same execution_stream
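The `-m "partN"` suffix is a standard pytest marker expression: each entry now collects only the tests decorated with the corresponding `@pytest.mark.partN` in test_llm_pytorch.py, so one file can be split across four jobs. Below is a minimal sketch of how such markers could be registered to avoid unknown-marker warnings; the actual TensorRT-LLM configuration may declare them elsewhere (e.g. pytest.ini or an existing conftest.py), so treat this as an illustration only.

# conftest.py (hypothetical sketch, not part of this commit): register the
# part0..part3 markers so that `pytest -m "part0"` filters cleanly.
def pytest_configure(config):
    for i in range(4):
        config.addinivalue_line(
            "markers", f"part{i}: partition {i} of test_llm_pytorch.py")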


@@ -148,7 +148,10 @@ l0_h100:
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]
   - unittest/_torch/executor
   - unittest/_torch/ray_orchestrator/single_gpu
-  - unittest/llmapi/test_llm_pytorch.py
+  - unittest/llmapi/test_llm_pytorch.py -m "part0"
+  - unittest/llmapi/test_llm_pytorch.py -m "part1"
+  - unittest/llmapi/test_llm_pytorch.py -m "part2"
+  - unittest/llmapi/test_llm_pytorch.py -m "part3"
   - unittest/llmapi/test_async_llm.py -m "not (gpu2 or gpu4)"
   - examples/test_ray.py::test_llm_inference_async_ray
 - condition:
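Each of these entries presumably maps to its own pytest invocation, which is what lets the four partitions run as parallel pre-merge jobs. As a rough local repro, a single partition could be run programmatically; the working directory and path resolution here are assumptions, not part of the commit.

# Hypothetical local repro of one partition (assumes the test-list path
# resolves relative to the repo's tests directory).
import pytest

if __name__ == "__main__":
    raise SystemExit(
        pytest.main(["unittest/llmapi/test_llm_pytorch.py", "-m", "part1"]))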


@@ -48,6 +48,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 @force_ampere
 @pytest.mark.parametrize("enable_chunked_prefill,", [False, True])
+@pytest.mark.part2
 def test_tinyllama_logits_processor(enable_chunked_prefill):
     tinyllama_logits_processor_test_harness(
         backend="pytorch", enable_chunked_prefill=enable_chunked_prefill)
@@ -62,6 +63,7 @@ def test_tinyllama_logits_processor(enable_chunked_prefill):
         (False, True, False, True),
         (False, True, True, True),
     ])
+@pytest.mark.part0
 def test_llm_get_stats(return_context_logits, use_overlap,
                        enable_chunked_prefill, enable_iter_req_stats):
     llm_get_stats_test_harness(tp_size=1,
@@ -82,6 +84,7 @@ def test_llm_get_stats(return_context_logits, use_overlap,
         (False, True, False, True),
         (False, True, True, True),
     ])
+@pytest.mark.part1
 def test_llm_get_stats_async(return_context_logits, use_overlap,
                              enable_chunked_prefill, enable_iter_req_stats):
     llm_get_stats_async_test_harness(
@@ -94,6 +97,7 @@ def test_llm_get_stats_async(return_context_logits, use_overlap,
         enable_iter_req_stats=enable_iter_req_stats)

+@pytest.mark.part1
 def test_llm_capture_request_error():
     _test_llm_capture_request_error(pytorch_backend=True, tp_size=1)
@@ -105,6 +109,7 @@ def test_llm_capture_request_error():
     [
         SamplingParams() # pytorch only supports n=1
     ])
+@pytest.mark.part0
 def test_llm_abort_request(sampling_params):
     llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
     run_llm_abort_request(llm=llm, sampling_params=sampling_params)
@@ -118,6 +123,7 @@ def _validate_invalid_token_error_scope():
 @force_ampere
+@pytest.mark.part1
 def test_llm_invalid_input_token():
     llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
     prompts = [
@@ -136,6 +142,7 @@ def test_llm_invalid_input_token():
 @force_ampere
+@pytest.mark.part0
 def test_llm_invalid_input_token_async():
     llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
     # NB: exc_info in _validate_invalid_token_error_scope creates a reference
@@ -167,6 +174,7 @@ def test_llm_invalid_input_token_async():
         futures[collect_idx].result()

+@pytest.mark.part2
 def test_llm_reward_model():
     rm_model_path = get_model_path("Qwen2.5-Math-PRM-7B")
     tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
@@ -188,6 +196,7 @@ def test_llm_reward_model():
 @skip_ray
+@pytest.mark.part3
 def test_llm_perf_metrics():
     with LLM(model=llama_model_path,
              kv_cache_config=global_kvcache_config) as llm:
@@ -216,6 +225,7 @@ def test_llm_perf_metrics():
 @skip_ray
+@pytest.mark.part3
 def test_llm_prometheus():
     test_prompts = [
         "Hello, my name is",
@@ -239,6 +249,7 @@ def test_llm_prometheus():
 @skip_ray
 @pytest.mark.parametrize("streaming", [True, False])
+@pytest.mark.part3
 def test_llm_with_postprocess_parallel_and_result_handler(streaming):
     run_llm_with_postprocess_parallel_and_result_handler(streaming,
                                                          "pytorch",
@@ -306,6 +317,7 @@ def llama_7b_lora_from_dir_test_harness(**llm_kwargs) -> None:
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part0
 def test_llama_7b_lora():
     llama_7b_lora_from_dir_test_harness()
@@ -368,6 +380,7 @@ def _check_llama_7b_multi_lora_evict_load_new_adapters(
 @skip_gpu_memory_less_than_40gb
 @skip_ray # https://nvbugs/5682551
+@pytest.mark.part3
 def test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache():
     """Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single
     llm.generate call, that's repeated twice.
@@ -381,6 +394,7 @@ def test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache():
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part1
 def test_llama_7b_multi_lora_evict_and_load_new_adapters_in_cpu_and_gpu_cache():
     """Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU
     cache size < LoRA CPU cache size.
@@ -394,6 +408,7 @@ def test_llama_7b_multi_lora_evict_and_load_new_adapters_in_cpu_and_gpu_cache():
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part0
 def test_llama_7b_multi_lora_read_from_cache_after_insert():
     """Test that loading and then using the same adapters loaded in cache works."""
     _check_llama_7b_multi_lora_evict_load_new_adapters(
@@ -405,6 +420,7 @@ def test_llama_7b_multi_lora_read_from_cache_after_insert():
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part3
 def test_llama_7b_multi_lora_evict_and_reload_evicted_adapters_in_cpu_and_gpu_cache(
 ):
     """Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU
@@ -427,6 +443,7 @@ def test_llama_7b_multi_lora_evict_and_reload_evicted_adapters_in_cpu_and_gpu_ca
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part2
 def test_llama_7b_peft_cache_config_affects_peft_cache_size():
     """Tests that LLM arg of peft_cache_config affects the peft cache sizes.
@@ -464,6 +481,7 @@ def test_llama_7b_peft_cache_config_affects_peft_cache_size():
 @skip_ray # https://nvbugs/5682551
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part1
 def test_llama_7b_lora_config_overrides_peft_cache_config():
     """Tests that cache size args in lora_config LLM arg override the cache size
     parameters in peft_cache_config LLM arg.
@@ -487,6 +505,7 @@ def test_llama_7b_lora_config_overrides_peft_cache_config():
 # https://jirasw.nvidia.com/browse/TRTLLM-5045
 @pytest.mark.skip(reason="https://nvbugs/5448464")
 @skip_gpu_memory_less_than_138gb
+@pytest.mark.part1
 def test_nemotron_nas_lora() -> None:
     lora_config = LoraConfig(lora_dir=[
         f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_r64"
@@ -519,6 +538,7 @@ def test_nemotron_nas_lora() -> None:
 @skip_gpu_memory_less_than_80gb
+@pytest.mark.part0
 def test_llama_3_1_8b_fp8_with_bf16_lora() -> None:
     skip_fp8_pre_ada(use_fp8=True)
     model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
@@ -549,6 +569,7 @@ def test_llama_3_1_8b_fp8_with_bf16_lora() -> None:
 @skip_gpu_memory_less_than_80gb
+@pytest.mark.part2
 def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
     model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct"
@@ -602,6 +623,7 @@ def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
     assert len(outputs) == 2

+@pytest.mark.part2
 def test_gemma3_1b_instruct_multi_lora() -> None:
     model_dir = f"{llm_models_root()}/gemma/gemma-3-1b-it"
@@ -666,6 +688,7 @@ def test_gemma3_1b_instruct_multi_lora() -> None:
         (16, 16, "rank_16"),
         (4, 8, "rank_4_max_8"),
     ])
+@pytest.mark.part3
 def test_load_torch_nemo_lora_function(tmp_path, lora_rank, max_lora_rank,
                                        description):
     """Test load_torch_nemo_lora function with different LoRA rank configurations."""
@@ -695,6 +718,7 @@ def test_load_torch_nemo_lora_function(tmp_path, lora_rank, max_lora_rank,
     }, f"Expected correct module mapping for {description}"

+@pytest.mark.part0
 def test_nemo_lora_unsupported_modules_validation(tmp_path):
     """Test validation of unsupported modules in NeMo LoRA."""
     from tensorrt_llm.lora_manager import load_torch_nemo_lora
@@ -720,6 +744,7 @@ def test_nemo_lora_unsupported_modules_validation(tmp_path):
 @force_ampere
+@pytest.mark.part1
 def test_gqa_nemo_lora(tmp_path):
     """
     Test NeMo-format LoRA checkpoint loading and GQA support in TinyLlama.
@@ -798,6 +823,7 @@ def test_gqa_nemo_lora(tmp_path):
 class TestLlmError:

+    @pytest.mark.part3
     def test_max_num_token_check(self):
         """ LLM should raise error when got prompt length exceed the valid range. """
         llm = LLM(llama_model_path,
@@ -828,6 +854,7 @@ FailingExecutor = type(
 @skip_ray
+@pytest.mark.part2
 def test_llm_with_proxy_error():
     """Test that LLM properly handles GenerationExecutorWorker constructor failures.
@@ -928,6 +955,7 @@ def test_llm_return_logprobs_streaming(prompt_logprobs, logprobs,
 class TestLlmError:

+    @pytest.mark.part3
     def test_max_num_token_check(self):
         """ LLM should raise error when got prompt length exceed the valid range. """
         llm = LLM(llama_model_path,
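For this split to be exhaustive, every test in test_llm_pytorch.py needs exactly one part marker; a test carrying no marker would be silently excluded from all four `-m` filters. The sketch below is a hypothetical sanity check, not part of this commit, that compares total collection against per-partition collection (the file path and the parsing of pytest's `--collect-only -q` output are assumptions).

# Hypothetical coverage check for the part0..part3 partitioning.
import subprocess

TEST_FILE = "unittest/llmapi/test_llm_pytorch.py"  # assumed working directory


def collected(marker_expr=None):
    """Count test node IDs reported by `pytest --collect-only -q`."""
    cmd = ["pytest", "--collect-only", "-q", TEST_FILE]
    if marker_expr is not None:
        cmd += ["-m", marker_expr]
    out = subprocess.run(cmd, capture_output=True, text=True).stdout
    # Node IDs contain "::" (e.g. file.py::test_name); summary lines do not.
    return sum(1 for line in out.splitlines() if "::" in line)


if __name__ == "__main__":
    total = collected()
    per_part = [collected(f"part{i}") for i in range(4)]
    assert sum(per_part) == total, (
        f"partition mismatch: parts collect {per_part} (sum {sum(per_part)}), "
        f"but {total} tests are collected overall")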