mirror of https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00

[TRTLLM-9551][infra] Partition test_llm_pytorch.py for parallel execution (#10400)

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>

commit 6b71b03947 (parent ea380ff45c)
@@ -13,7 +13,10 @@ l0_a100:
   stage: pre_merge
   backend: "pytorch"
   tests:
-  - unittest/llmapi/test_llm_pytorch.py
+  - unittest/llmapi/test_llm_pytorch.py -m "part0"
+  - unittest/llmapi/test_llm_pytorch.py -m "part1"
+  - unittest/llmapi/test_llm_pytorch.py -m "part2"
+  - unittest/llmapi/test_llm_pytorch.py -m "part3"
  - unittest/llmapi/test_mpi_session.py ISOLATION
  - unittest/llmapi/test_memory_profiling.py::test_profile_kvcache # profile kvcache for vision encoder
  - unittest/llmapi/test_memory_profiling.py::test_pyexecutor_and_kvcache_share_execution_stream # test that PyExecutor and KVCacheManager share the same execution_stream
@@ -148,7 +148,10 @@ l0_h100:
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]
   - unittest/_torch/executor
   - unittest/_torch/ray_orchestrator/single_gpu
-  - unittest/llmapi/test_llm_pytorch.py
+  - unittest/llmapi/test_llm_pytorch.py -m "part0"
+  - unittest/llmapi/test_llm_pytorch.py -m "part1"
+  - unittest/llmapi/test_llm_pytorch.py -m "part2"
+  - unittest/llmapi/test_llm_pytorch.py -m "part3"
   - unittest/llmapi/test_async_llm.py -m "not (gpu2 or gpu4)"
   - examples/test_ray.py::test_llm_inference_async_ray
 - condition:
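
The new test-list entries pass the pytest option -m "part0" through -m "part3", so each CI entry collects only the tests carrying the matching marker instead of the whole file. Below is a minimal sketch, not taken from this commit, of how such custom markers are typically registered so pytest does not warn about unknown marks; the real repository may declare them in pytest.ini or a shared conftest.py instead, and only the marker names part0..part3 come from the diff.

# conftest.py -- illustrative sketch only; the actual registration point in
# TensorRT-LLM may differ. It shows the mechanism the test-list entries rely
# on: -m "partN" selects tests decorated with @pytest.mark.partN.
def pytest_configure(config):
    for i in range(4):
        config.addinivalue_line(
            "markers",
            f"part{i}: partition {i} of test_llm_pytorch.py for parallel CI execution",
        )

With the markers registered, pytest unittest/llmapi/test_llm_pytorch.py -m "part1" runs only the @pytest.mark.part1 tests, which is what each of the four new entries does in place of the single unpartitioned entry.
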
@@ -48,6 +48,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 
 @force_ampere
 @pytest.mark.parametrize("enable_chunked_prefill,", [False, True])
+@pytest.mark.part2
 def test_tinyllama_logits_processor(enable_chunked_prefill):
     tinyllama_logits_processor_test_harness(
         backend="pytorch", enable_chunked_prefill=enable_chunked_prefill)
@@ -62,6 +63,7 @@ def test_tinyllama_logits_processor(enable_chunked_prefill):
     (False, True, False, True),
     (False, True, True, True),
 ])
+@pytest.mark.part0
 def test_llm_get_stats(return_context_logits, use_overlap,
                        enable_chunked_prefill, enable_iter_req_stats):
     llm_get_stats_test_harness(tp_size=1,
@@ -82,6 +84,7 @@ def test_llm_get_stats(return_context_logits, use_overlap,
     (False, True, False, True),
     (False, True, True, True),
 ])
+@pytest.mark.part1
 def test_llm_get_stats_async(return_context_logits, use_overlap,
                              enable_chunked_prefill, enable_iter_req_stats):
     llm_get_stats_async_test_harness(
@@ -94,6 +97,7 @@ def test_llm_get_stats_async(return_context_logits, use_overlap,
         enable_iter_req_stats=enable_iter_req_stats)
 
 
+@pytest.mark.part1
 def test_llm_capture_request_error():
     _test_llm_capture_request_error(pytorch_backend=True, tp_size=1)
 
@@ -105,6 +109,7 @@ def test_llm_capture_request_error():
     [
         SamplingParams()  # pytorch only supports n=1
     ])
+@pytest.mark.part0
 def test_llm_abort_request(sampling_params):
     llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
     run_llm_abort_request(llm=llm, sampling_params=sampling_params)
@@ -118,6 +123,7 @@ def _validate_invalid_token_error_scope():
 
 
 @force_ampere
+@pytest.mark.part1
 def test_llm_invalid_input_token():
     llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
     prompts = [
@@ -136,6 +142,7 @@ def test_llm_invalid_input_token():
 
 
 @force_ampere
+@pytest.mark.part0
 def test_llm_invalid_input_token_async():
     llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
     # NB: exc_info in _validate_invalid_token_error_scope creates a reference
@@ -167,6 +174,7 @@ def test_llm_invalid_input_token_async():
         futures[collect_idx].result()
 
 
+@pytest.mark.part2
 def test_llm_reward_model():
     rm_model_path = get_model_path("Qwen2.5-Math-PRM-7B")
     tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
@@ -188,6 +196,7 @@ def test_llm_reward_model():
 
 
 @skip_ray
+@pytest.mark.part3
 def test_llm_perf_metrics():
     with LLM(model=llama_model_path,
              kv_cache_config=global_kvcache_config) as llm:
@@ -216,6 +225,7 @@ def test_llm_perf_metrics():
 
 
 @skip_ray
+@pytest.mark.part3
 def test_llm_prometheus():
     test_prompts = [
         "Hello, my name is",
@@ -239,6 +249,7 @@ def test_llm_prometheus():
 
 @skip_ray
 @pytest.mark.parametrize("streaming", [True, False])
+@pytest.mark.part3
 def test_llm_with_postprocess_parallel_and_result_handler(streaming):
     run_llm_with_postprocess_parallel_and_result_handler(streaming,
                                                          "pytorch",
@@ -306,6 +317,7 @@ def llama_7b_lora_from_dir_test_harness(**llm_kwargs) -> None:
 
 
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part0
 def test_llama_7b_lora():
     llama_7b_lora_from_dir_test_harness()
 
@@ -368,6 +380,7 @@ def _check_llama_7b_multi_lora_evict_load_new_adapters(
 
 @skip_gpu_memory_less_than_40gb
 @skip_ray  # https://nvbugs/5682551
+@pytest.mark.part3
 def test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache():
     """Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single
     llm.generate call, that's repeated twice.
@@ -381,6 +394,7 @@ def test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache():
 
 
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part1
 def test_llama_7b_multi_lora_evict_and_load_new_adapters_in_cpu_and_gpu_cache():
     """Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU
     cache size < LoRA CPU cache size.
@@ -394,6 +408,7 @@ def test_llama_7b_multi_lora_evict_and_load_new_adapters_in_cpu_and_gpu_cache():
 
 
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part0
 def test_llama_7b_multi_lora_read_from_cache_after_insert():
     """Test that loading and then using the same adapters loaded in cache works."""
     _check_llama_7b_multi_lora_evict_load_new_adapters(
@@ -405,6 +420,7 @@ def test_llama_7b_multi_lora_read_from_cache_after_insert():
 
 
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part3
 def test_llama_7b_multi_lora_evict_and_reload_evicted_adapters_in_cpu_and_gpu_cache(
 ):
     """Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU
@@ -427,6 +443,7 @@ def test_llama_7b_multi_lora_evict_and_reload_evicted_adapters_in_cpu_and_gpu_ca
 
 
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part2
 def test_llama_7b_peft_cache_config_affects_peft_cache_size():
     """Tests that LLM arg of peft_cache_config affects the peft cache sizes.
 
@@ -464,6 +481,7 @@ def test_llama_7b_peft_cache_config_affects_peft_cache_size():
 
 @skip_ray  # https://nvbugs/5682551
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part1
 def test_llama_7b_lora_config_overrides_peft_cache_config():
     """Tests that cache size args in lora_config LLM arg override the cache size
     parameters in peft_cache_config LLM arg.
@@ -487,6 +505,7 @@ def test_llama_7b_lora_config_overrides_peft_cache_config():
 # https://jirasw.nvidia.com/browse/TRTLLM-5045
 @pytest.mark.skip(reason="https://nvbugs/5448464")
 @skip_gpu_memory_less_than_138gb
+@pytest.mark.part1
 def test_nemotron_nas_lora() -> None:
     lora_config = LoraConfig(lora_dir=[
         f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_r64"
@@ -519,6 +538,7 @@ def test_nemotron_nas_lora() -> None:
 
 
 @skip_gpu_memory_less_than_80gb
+@pytest.mark.part0
 def test_llama_3_1_8b_fp8_with_bf16_lora() -> None:
     skip_fp8_pre_ada(use_fp8=True)
     model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
@@ -549,6 +569,7 @@ def test_llama_3_1_8b_fp8_with_bf16_lora() -> None:
 
 
 @skip_gpu_memory_less_than_80gb
+@pytest.mark.part2
 def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
     model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct"
 
@@ -602,6 +623,7 @@ def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
     assert len(outputs) == 2
 
 
+@pytest.mark.part2
 def test_gemma3_1b_instruct_multi_lora() -> None:
     model_dir = f"{llm_models_root()}/gemma/gemma-3-1b-it"
 
@@ -666,6 +688,7 @@ def test_gemma3_1b_instruct_multi_lora() -> None:
     (16, 16, "rank_16"),
     (4, 8, "rank_4_max_8"),
 ])
+@pytest.mark.part3
 def test_load_torch_nemo_lora_function(tmp_path, lora_rank, max_lora_rank,
                                        description):
     """Test load_torch_nemo_lora function with different LoRA rank configurations."""
@@ -695,6 +718,7 @@ def test_load_torch_nemo_lora_function(tmp_path, lora_rank, max_lora_rank,
     }, f"Expected correct module mapping for {description}"
 
 
+@pytest.mark.part0
 def test_nemo_lora_unsupported_modules_validation(tmp_path):
     """Test validation of unsupported modules in NeMo LoRA."""
     from tensorrt_llm.lora_manager import load_torch_nemo_lora
@@ -720,6 +744,7 @@ def test_nemo_lora_unsupported_modules_validation(tmp_path):
 
 
 @force_ampere
+@pytest.mark.part1
 def test_gqa_nemo_lora(tmp_path):
     """
     Test NeMo-format LoRA checkpoint loading and GQA support in TinyLlama.
@@ -798,6 +823,7 @@ def test_gqa_nemo_lora(tmp_path):
 
 class TestLlmError:
 
+    @pytest.mark.part3
     def test_max_num_token_check(self):
         """ LLM should raise error when got prompt length exceed the valid range. """
         llm = LLM(llama_model_path,
@@ -828,6 +854,7 @@ FailingExecutor = type(
 
 
 @skip_ray
+@pytest.mark.part2
 def test_llm_with_proxy_error():
     """Test that LLM properly handles GenerationExecutorWorker constructor failures.
 
@@ -928,6 +955,7 @@ def test_llm_return_logprobs_streaming(prompt_logprobs, logprobs,
 
 class TestLlmError:
 
+    @pytest.mark.part3
     def test_max_num_token_check(self):
         """ LLM should raise error when got prompt length exceed the valid range. """
         llm = LLM(llama_model_path,
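
For the four partitions to be equivalent to the old single test-list entry, every test in test_llm_pytorch.py needs to carry one of the part0..part3 markers. The following is a hypothetical local check, not part of this commit; it assumes pytest is installed and that the file path matches the test-list entries above.

# verify_partitions.py -- hypothetical helper, not part of this commit.
# Collects the tests that carry none of the part0..part3 markers; an empty
# result means the four CI partitions together cover the whole file.
import subprocess
import sys

TEST_FILE = "unittest/llmapi/test_llm_pytorch.py"  # assumed path, as used in the test lists

result = subprocess.run(
    [
        sys.executable, "-m", "pytest", TEST_FILE, "--collect-only", "-q",
        "-m", "not (part0 or part1 or part2 or part3)",
    ],
    capture_output=True,
    text=True,
)

# pytest --collect-only -q prints one node id per line, e.g. file::test_name.
unmarked = [line for line in result.stdout.splitlines() if "::" in line]
if unmarked:
    print("Tests missing a partition marker:")
    print("\n".join(unmarked))
else:
    print("All collected tests belong to one of the four partitions.")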