Move PyTorch tests of the LLM API into separate test files (#3745)

* move pytorch tests of LLM API into separate test files

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* polish

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* update

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* clean

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

---------

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
QI JUN, 2025-04-22 14:36:59 -07:00, committed by GitHub
parent b16a127026
commit 257abfbc51
8 changed files with 135 additions and 77 deletions
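
For local verification, the relocated PyTorch suites can be run straight through pytest's Python entry point. The sketch below is not part of the diff; it assumes execution from the repository root with the test dependencies installed, and it only reuses the file paths and the gpu4 marker that appear in the changes that follow.

    import pytest

    # Single-GPU PyTorch LLM API tests (new file added by this commit).
    pytest.main(["tests/unittest/llmapi/test_llm_pytorch.py", "-v"])

    # Multi-GPU variant, limited to the 4-GPU cases, mirroring the l0_dgx_h100 entry below.
    pytest.main(["tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py", "-m", "gpu4", "-v"])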


@@ -524,6 +524,8 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
         "tests/integration/test_lists/test-db/l0_dgx_h200.yml",
         "tests/unittest/_torch/multi_gpu/",
         "tests/unittest/_torch/multi_gpu_modeling/",
+        "tests/unittest/llmapi/test_llm_pytorch.py",
+        "tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py",
         "jenkins/L0_Test.groovy",
     ]

jenkins/L0_Test.groovy

@@ -1199,6 +1199,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "A30-TensorRT-2": ["a30", "l0_a30", 2, 4],
         "A30-TensorRT-3": ["a30", "l0_a30", 3, 4],
         "A30-TensorRT-4": ["a30", "l0_a30", 4, 4],
+        "A100X-PyTorch-1": ["a100x", "l0_a100", 1, 1],
         "A100X-TensorRT-1": ["a100x", "l0_a100", 1, 4],
         "A100X-TensorRT-2": ["a100x", "l0_a100", 2, 4],
         "A100X-TensorRT-3": ["a100x", "l0_a100", 3, 4],

tests/integration/test_lists/test-db/l0_a100.yml

@@ -1,5 +1,19 @@
 version: 0.0.1
 l0_a100:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*a100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: pre_merge
+      backend: "pytorch"
+  tests:
+  - unittest/llmapi/test_llm_pytorch.py
 - condition:
     ranges:
       system_gpu_count:

tests/integration/test_lists/test-db/l0_dgx_h100.yml

@@ -16,6 +16,7 @@ l0_dgx_h100:
   # ------------- PyTorch tests ---------------
   - unittest/_torch/multi_gpu
   - unittest/_torch/auto_deploy/unit/multigpu
+  - unittest/llmapi/test_llm_multi_gpu_pytorch.py -m "gpu4"
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM]

tests/unittest/llmapi/test_llm.py

@@ -1099,12 +1099,8 @@ def tinyllama_guided_decoding_test_harness(**llm_kwargs):
 @force_ampere
 @pytest.mark.part0
-@pytest.mark.parametrize("backend", ['tensorrt', 'pytorch'])
-def test_tinyllama_guided_decoding(backend: str):
-    llm_kwargs = {}
-    if backend == 'pytorch':
-        llm_kwargs['backend'] = 'pytorch'
-    tinyllama_guided_decoding_test_harness(**llm_kwargs)
+def test_tinyllama_guided_decoding():
+    tinyllama_guided_decoding_test_harness()
 @pytest.mark.part0
@@ -1766,18 +1762,13 @@ def llm_get_stats_test_harness(tp_size: int = 1,
     assert llm.get_stats(2)
-@pytest.mark.parametrize("return_context_logits, pytorch_backend, use_overlap",
-                         [
-                             (True, False, False),
-                             (False, False, False),
-                             (False, True, False),
-                             (False, True, True),
-                         ])
-def test_llm_get_stats(return_context_logits, pytorch_backend, use_overlap):
+@pytest.mark.parametrize("return_context_logits", [
+    (True, ),
+    (False, ),
+])
+def test_llm_get_stats(return_context_logits):
     llm_get_stats_test_harness(tp_size=1,
-                               return_context_logits=return_context_logits,
-                               pytorch_backend=pytorch_backend,
-                               use_overlap=use_overlap)
+                               return_context_logits=return_context_logits)
 def llm_get_stats_async_test_harness(tp_size: int = 1,
@@ -1833,20 +1824,15 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
     asyncio.run(main())
-@pytest.mark.parametrize("return_context_logits, pytorch_backend, use_overlap",
-                         [
-                             (True, False, False),
-                             (False, False, False),
-                             (False, True, False),
-                             (False, True, True),
-                         ])
-def test_llm_get_stats_async(return_context_logits, pytorch_backend,
-                             use_overlap):
+@pytest.mark.parametrize("return_context_logits", [
+    (True, ),
+    (False, ),
+])
+def test_llm_get_stats_async(return_context_logits):
     llm_get_stats_async_test_harness(
         tp_size=1,
         return_context_logits=return_context_logits,
-        pytorch_backend=pytorch_backend,
-        use_overlap=use_overlap)
+    )
 def test_llm_chunked_prefill():
@@ -1986,10 +1972,9 @@ def run_llm_with_postprocess_parallel_and_result_handler(
 @pytest.mark.parametrize("streaming", [True, False])
-@pytest.mark.parametrize("backend", [None, "pytorch"])
-def test_llm_with_postprocess_parallel_and_result_handler(streaming, backend):
+def test_llm_with_postprocess_parallel_and_result_handler(streaming):
     run_llm_with_postprocess_parallel_and_result_handler(streaming,
-                                                          backend,
+                                                          backend=None,
                                                           tp_size=1)
@@ -2041,41 +2026,6 @@ def test_llm_abort_request(llm_for_sampling_params,
                           sampling_params=sampling_params)
-@force_ampere
-@pytest.mark.parametrize(
-    "sampling_params",
-    [
-        SamplingParams()  # pytorch only supports n=1
-    ])
-def test_llm_abort_request_pytorch(sampling_params):
-    from tensorrt_llm._torch import LLM as LLM_torch
-    llm = LLM_torch(model=llama_model_path,
-                    kv_cache_config=global_kvcache_config)
-    run_llm_abort_request(llm=llm, sampling_params=sampling_params)
-def test_llm_reward_model_pytorch():
-    rm_model_path = get_model_path("Qwen2.5-Math-PRM-7B")
-    tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
-    tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"]
-    from tensorrt_llm._torch import LLM as LLM_torch
-    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-    llm = LLM_torch(
-        model=rm_model_path,
-        pytorch_backend_config=PyTorchConfig(attn_backend="VANILLA"))
-    sampling_params = SamplingParams(return_context_logits=True)
-    outputs = llm.generate(prompts, sampling_params)
-    scores = outputs[0].context_logits
-    print(scores)
-    assert scores.shape == (tokenized_input.shape[1], 2)
-    assert not outputs[0].outputs[0].text
 def test_llm_sampling_params_n_lt_max_batch_size():
     sampling_params = SamplingParams(n=2, best_of=1)
     build_config = BuildConfig(max_batch_size=1, max_seq_len=1024)
@@ -2117,7 +2067,3 @@ def test_llm_api_draft_target():
         prompt = output.prompt
         generated_text = output.outputs[0].text
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-if __name__ == '__main__':
-    test_llm_with_postprocess_parallel_and_result_handler(True, "pytorch")

tests/unittest/llmapi/test_llm_multi_gpu.py

@@ -234,16 +234,11 @@ def test_tinyllama_logits_processor_tp2pp2():
 @pytest.mark.gpu4
 @pytest.mark.part0
-@pytest.mark.parametrize("backend", ['tensorrt', 'pytorch'])
-def test_tinyllama_guided_decoding_tp2pp2(backend: str):
-    llm_kwargs = {}
-    if backend == 'pytorch':
-        llm_kwargs['backend'] = 'pytorch'
+def test_tinyllama_guided_decoding_tp2pp2():
     tinyllama_guided_decoding_test_harness(
         tensor_parallel_size=2,
         pipeline_parallel_size=2,
-        kv_cache_config=global_kv_cache_config,
-        **llm_kwargs)
+        kv_cache_config=global_kv_cache_config)
 @pytest.mark.gpu2

tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py (new file)

@@ -0,0 +1,15 @@
+import pytest
+# isort: off
+from .test_llm import (global_kvcache_config,
+                       tinyllama_guided_decoding_test_harness)
+# isort: on
+@pytest.mark.gpu4
+def test_tinyllama_guided_decoding_tp2pp2():
+    tinyllama_guided_decoding_test_harness(
+        tensor_parallel_size=2,
+        pipeline_parallel_size=2,
+        kv_cache_config=global_kvcache_config,
+        backend='pytorch')

tests/unittest/llmapi/test_llm_pytorch.py (new file)

@@ -0,0 +1,84 @@
+import pytest
+from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
+from tensorrt_llm.sampling_params import SamplingParams
+# isort: off
+from .test_llm import (get_model_path, global_kvcache_config, llama_model_path,
+                       llm_get_stats_async_test_harness,
+                       llm_get_stats_test_harness, prompts,
+                       run_llm_abort_request,
+                       run_llm_with_postprocess_parallel_and_result_handler,
+                       tinyllama_guided_decoding_test_harness)
+from utils.util import force_ampere
+# isort: on
+@force_ampere
+def test_tinyllama_guided_decoding():
+    tinyllama_guided_decoding_test_harness(backend="pytorch")
+@pytest.mark.parametrize("return_context_logits, use_overlap", [
+    (False, False),
+    (False, True),
+])
+def test_llm_get_stats(return_context_logits, use_overlap):
+    llm_get_stats_test_harness(tp_size=1,
+                               return_context_logits=return_context_logits,
+                               pytorch_backend=True,
+                               use_overlap=use_overlap)
+@pytest.mark.parametrize("return_context_logits, use_overlap", [
+    (False, False),
+    (False, True),
+])
+def test_llm_get_stats_async(return_context_logits, use_overlap):
+    llm_get_stats_async_test_harness(
+        tp_size=1,
+        return_context_logits=return_context_logits,
+        pytorch_backend=True,
+        use_overlap=use_overlap)
+@force_ampere
+@pytest.mark.parametrize(
+    "sampling_params",
+    [
+        SamplingParams()  # pytorch only supports n=1
+    ])
+def test_llm_abort_request(sampling_params):
+    from tensorrt_llm._torch import LLM as LLM_torch
+    llm = LLM_torch(model=llama_model_path,
+                    kv_cache_config=global_kvcache_config)
+    run_llm_abort_request(llm=llm, sampling_params=sampling_params)
+def test_llm_reward_model():
+    rm_model_path = get_model_path("Qwen2.5-Math-PRM-7B")
+    tokenizer = TransformersTokenizer.from_pretrained(rm_model_path)
+    tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"]
+    from tensorrt_llm._torch import LLM as LLM_torch
+    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
+    llm = LLM_torch(
+        model=rm_model_path,
+        pytorch_backend_config=PyTorchConfig(attn_backend="VANILLA"))
+    sampling_params = SamplingParams(return_context_logits=True)
+    outputs = llm.generate(prompts, sampling_params)
+    scores = outputs[0].context_logits
+    print(scores)
+    assert scores.shape == (tokenized_input.shape[1], 2)
+    assert not outputs[0].outputs[0].text
+@pytest.mark.parametrize("streaming", [True, False])
+def test_llm_with_postprocess_parallel_and_result_handler(streaming):
+    run_llm_with_postprocess_parallel_and_result_handler(streaming,
+                                                         "pytorch",
+                                                         tp_size=1)
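
The gpu4 marker used by the new multi-GPU file (and referenced by the l0_dgx_h100 test-db entry above) is a custom pytest marker; such markers are normally declared in the project's existing pytest configuration. The snippet below is only an illustrative sketch of one common way to register them, not code from this commit.

    # conftest.py -- illustrative only; marker names taken from the tests above.
    def pytest_configure(config):
        # Register GPU-count markers so `-m "gpu4"` selection runs without unknown-marker warnings.
        config.addinivalue_line("markers", "gpu2: tests that require 2 GPUs")
        config.addinivalue_line("markers", "gpu4: tests that require 4 GPUs")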