Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00
[TRTLLM-9551][infra] Partition test_llm_pytorch.py for parallel execution (#10400)
Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
This commit is contained in:
parent ea380ff45c
commit 6b71b03947
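This change partitions the tests in unittest/llmapi/test_llm_pytorch.py into four groups via custom pytest markers (part0 through part3) and replaces the single test-list entry for the file in the A100 and H100 pre-merge stages with one entry per partition, so the file can run as four parallel CI jobs instead of one long serial job.

The diff does not show where the part0..part3 markers are declared; as a minimal sketch, one way to register them (assuming a conftest.py; the repository may instead list them under a markers section in pytest.ini or pyproject.toml) would be:

# conftest.py -- illustrative sketch, not part of this commit
def pytest_configure(config):
    # Declare the partition markers so `-m "partN"` selection works cleanly,
    # including under --strict-markers.
    for i in range(4):
        config.addinivalue_line(
            "markers",
            f"part{i}: partition {i} of test_llm_pytorch.py for parallel CI execution")

With the markers in place, each test-list entry below collects only the tests decorated with the matching @pytest.mark.partN; a test left unmarked would not be selected by any of the four entries.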
@@ -13,7 +13,10 @@ l0_a100:
       stage: pre_merge
       backend: "pytorch"
   tests:
-  - unittest/llmapi/test_llm_pytorch.py
+  - unittest/llmapi/test_llm_pytorch.py -m "part0"
+  - unittest/llmapi/test_llm_pytorch.py -m "part1"
+  - unittest/llmapi/test_llm_pytorch.py -m "part2"
+  - unittest/llmapi/test_llm_pytorch.py -m "part3"
   - unittest/llmapi/test_mpi_session.py ISOLATION
   - unittest/llmapi/test_memory_profiling.py::test_profile_kvcache # profile kvcache for vision encoder
   - unittest/llmapi/test_memory_profiling.py::test_pyexecutor_and_kvcache_share_execution_stream # test that PyExecutor and KVCacheManager share the same execution_stream
@@ -148,7 +148,10 @@ l0_h100:
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[False-False-Qwen3-8B-FP8]
   - unittest/_torch/executor
   - unittest/_torch/ray_orchestrator/single_gpu
-  - unittest/llmapi/test_llm_pytorch.py
+  - unittest/llmapi/test_llm_pytorch.py -m "part0"
+  - unittest/llmapi/test_llm_pytorch.py -m "part1"
+  - unittest/llmapi/test_llm_pytorch.py -m "part2"
+  - unittest/llmapi/test_llm_pytorch.py -m "part3"
   - unittest/llmapi/test_async_llm.py -m "not (gpu2 or gpu4)"
   - examples/test_ray.py::test_llm_inference_async_ray
 - condition:
@@ -48,6 +48,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 
 @force_ampere
 @pytest.mark.parametrize("enable_chunked_prefill,", [False, True])
+@pytest.mark.part2
 def test_tinyllama_logits_processor(enable_chunked_prefill):
     tinyllama_logits_processor_test_harness(
         backend="pytorch", enable_chunked_prefill=enable_chunked_prefill)
@@ -62,6 +63,7 @@ def test_tinyllama_logits_processor(enable_chunked_prefill):
     (False, True, False, True),
     (False, True, True, True),
 ])
+@pytest.mark.part0
 def test_llm_get_stats(return_context_logits, use_overlap,
                        enable_chunked_prefill, enable_iter_req_stats):
     llm_get_stats_test_harness(tp_size=1,
@@ -82,6 +84,7 @@ def test_llm_get_stats(return_context_logits, use_overlap,
     (False, True, False, True),
     (False, True, True, True),
 ])
+@pytest.mark.part1
 def test_llm_get_stats_async(return_context_logits, use_overlap,
                              enable_chunked_prefill, enable_iter_req_stats):
     llm_get_stats_async_test_harness(
@@ -94,6 +97,7 @@ def test_llm_get_stats_async(return_context_logits, use_overlap,
         enable_iter_req_stats=enable_iter_req_stats)
 
 
+@pytest.mark.part1
 def test_llm_capture_request_error():
     _test_llm_capture_request_error(pytorch_backend=True, tp_size=1)
 
@@ -105,6 +109,7 @@ def test_llm_capture_request_error():
     [
         SamplingParams() # pytorch only supports n=1
     ])
+@pytest.mark.part0
 def test_llm_abort_request(sampling_params):
     llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
     run_llm_abort_request(llm=llm, sampling_params=sampling_params)
@@ -118,6 +123,7 @@ def _validate_invalid_token_error_scope():
 
 
 @force_ampere
+@pytest.mark.part1
 def test_llm_invalid_input_token():
     llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
     prompts = [
@@ -136,6 +142,7 @@ def test_llm_invalid_input_token():
 
 
 @force_ampere
+@pytest.mark.part0
 def test_llm_invalid_input_token_async():
     llm = LLM(model=llama_model_path, kv_cache_config=global_kvcache_config)
     # NB: exc_info in _validate_invalid_token_error_scope creates a reference
@@ -167,6 +174,7 @@ def test_llm_invalid_input_token_async():
         futures[collect_idx].result()
 
 
+@pytest.mark.part2
 def test_llm_reward_model():
     rm_model_path = get_model_path("Qwen2.5-Math-PRM-7B")
     tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
@@ -188,6 +196,7 @@ def test_llm_reward_model():
 
 
 @skip_ray
+@pytest.mark.part3
 def test_llm_perf_metrics():
     with LLM(model=llama_model_path,
              kv_cache_config=global_kvcache_config) as llm:
@@ -216,6 +225,7 @@ def test_llm_perf_metrics():
 
 
 @skip_ray
+@pytest.mark.part3
 def test_llm_prometheus():
     test_prompts = [
         "Hello, my name is",
@@ -239,6 +249,7 @@ def test_llm_prometheus():
 
 @skip_ray
 @pytest.mark.parametrize("streaming", [True, False])
+@pytest.mark.part3
 def test_llm_with_postprocess_parallel_and_result_handler(streaming):
     run_llm_with_postprocess_parallel_and_result_handler(streaming,
                                                          "pytorch",
@@ -306,6 +317,7 @@ def llama_7b_lora_from_dir_test_harness(**llm_kwargs) -> None:
 
 
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part0
 def test_llama_7b_lora():
     llama_7b_lora_from_dir_test_harness()
 
@@ -368,6 +380,7 @@ def _check_llama_7b_multi_lora_evict_load_new_adapters(
 
 @skip_gpu_memory_less_than_40gb
 @skip_ray # https://nvbugs/5682551
+@pytest.mark.part3
 def test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache():
     """Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single
     llm.generate call, that's repeated twice.
@@ -381,6 +394,7 @@ def test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache():
 
 
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part1
 def test_llama_7b_multi_lora_evict_and_load_new_adapters_in_cpu_and_gpu_cache():
     """Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU
     cache size < LoRA CPU cache size.
@@ -394,6 +408,7 @@ def test_llama_7b_multi_lora_evict_and_load_new_adapters_in_cpu_and_gpu_cache():
 
 
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part0
 def test_llama_7b_multi_lora_read_from_cache_after_insert():
     """Test that loading and then using the same adapters loaded in cache works."""
     _check_llama_7b_multi_lora_evict_load_new_adapters(
@@ -405,6 +420,7 @@ def test_llama_7b_multi_lora_read_from_cache_after_insert():
 
 
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part3
 def test_llama_7b_multi_lora_evict_and_reload_evicted_adapters_in_cpu_and_gpu_cache(
 ):
     """Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU
@@ -427,6 +443,7 @@ def test_llama_7b_multi_lora_evict_and_reload_evicted_adapters_in_cpu_and_gpu_ca
 
 
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part2
 def test_llama_7b_peft_cache_config_affects_peft_cache_size():
     """Tests that LLM arg of peft_cache_config affects the peft cache sizes.
 
@@ -464,6 +481,7 @@ def test_llama_7b_peft_cache_config_affects_peft_cache_size():
 
 @skip_ray # https://nvbugs/5682551
 @skip_gpu_memory_less_than_40gb
+@pytest.mark.part1
 def test_llama_7b_lora_config_overrides_peft_cache_config():
     """Tests that cache size args in lora_config LLM arg override the cache size
     parameters in peft_cache_config LLM arg.
@@ -487,6 +505,7 @@ def test_llama_7b_lora_config_overrides_peft_cache_config():
 # https://jirasw.nvidia.com/browse/TRTLLM-5045
 @pytest.mark.skip(reason="https://nvbugs/5448464")
 @skip_gpu_memory_less_than_138gb
+@pytest.mark.part1
 def test_nemotron_nas_lora() -> None:
     lora_config = LoraConfig(lora_dir=[
         f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-lora-adapter_r64"
@@ -519,6 +538,7 @@ def test_nemotron_nas_lora() -> None:
 
 
 @skip_gpu_memory_less_than_80gb
+@pytest.mark.part0
 def test_llama_3_1_8b_fp8_with_bf16_lora() -> None:
     skip_fp8_pre_ada(use_fp8=True)
     model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
@@ -549,6 +569,7 @@ def test_llama_3_1_8b_fp8_with_bf16_lora() -> None:
 
 
 @skip_gpu_memory_less_than_80gb
+@pytest.mark.part2
 def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
     model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct"
 
@@ -602,6 +623,7 @@ def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
     assert len(outputs) == 2
 
 
+@pytest.mark.part2
 def test_gemma3_1b_instruct_multi_lora() -> None:
     model_dir = f"{llm_models_root()}/gemma/gemma-3-1b-it"
 
@@ -666,6 +688,7 @@ def test_gemma3_1b_instruct_multi_lora() -> None:
     (16, 16, "rank_16"),
     (4, 8, "rank_4_max_8"),
 ])
+@pytest.mark.part3
 def test_load_torch_nemo_lora_function(tmp_path, lora_rank, max_lora_rank,
                                        description):
     """Test load_torch_nemo_lora function with different LoRA rank configurations."""
@@ -695,6 +718,7 @@ def test_load_torch_nemo_lora_function(tmp_path, lora_rank, max_lora_rank,
     }, f"Expected correct module mapping for {description}"
 
 
+@pytest.mark.part0
 def test_nemo_lora_unsupported_modules_validation(tmp_path):
     """Test validation of unsupported modules in NeMo LoRA."""
     from tensorrt_llm.lora_manager import load_torch_nemo_lora
@@ -720,6 +744,7 @@ def test_nemo_lora_unsupported_modules_validation(tmp_path):
 
 
 @force_ampere
+@pytest.mark.part1
 def test_gqa_nemo_lora(tmp_path):
     """
     Test NeMo-format LoRA checkpoint loading and GQA support in TinyLlama.
@@ -798,6 +823,7 @@ def test_gqa_nemo_lora(tmp_path):
 
 class TestLlmError:
 
+    @pytest.mark.part3
     def test_max_num_token_check(self):
         """ LLM should raise error when got prompt length exceed the valid range. """
         llm = LLM(llama_model_path,
@@ -828,6 +854,7 @@ FailingExecutor = type(
 
 
 @skip_ray
+@pytest.mark.part2
 def test_llm_with_proxy_error():
     """Test that LLM properly handles GenerationExecutorWorker constructor failures.
 
@@ -928,6 +955,7 @@ def test_llm_return_logprobs_streaming(prompt_logprobs, logprobs,
 
 class TestLlmError:
 
+    @pytest.mark.part3
     def test_max_num_token_check(self):
         """ LLM should raise error when got prompt length exceed the valid range. """
         llm = LLM(llama_model_path,
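One caveat of marker-based partitioning is that a test added later without a partN marker silently drops out of all four test-list entries. A small, hypothetical coverage check (not part of this commit; it assumes the repository's test dependencies are installed and that it runs from the directory containing unittest/llmapi/test_llm_pytorch.py) could compare the unfiltered collection against the union of the four partitions:

# check_partitions.py -- illustrative sketch only
import pytest

TEST_FILE = "unittest/llmapi/test_llm_pytorch.py"  # assumed path, adjust as needed


class _Recorder:
    """Record the node IDs that survive collection (i.e. after -m deselection)."""

    def __init__(self):
        self.nodeids = set()

    def pytest_collection_finish(self, session):
        self.nodeids.update(item.nodeid for item in session.items)


def collect(*extra_args):
    """Collect TEST_FILE without running it, optionally with a -m filter."""
    recorder = _Recorder()
    pytest.main(["--collect-only", "-q", TEST_FILE, *extra_args],
                plugins=[recorder])
    return recorder.nodeids


if __name__ == "__main__":
    all_tests = collect()
    parts = [collect("-m", f"part{i}") for i in range(4)]
    missing = all_tests - set().union(*parts)
    assert not missing, f"tests without a partN marker: {sorted(missing)}"
    assert sum(len(p) for p in parts) == len(all_tests), \
        "some test carries more than one partN marker"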