feat: Add stress test for TRT-LLM (#3250)

Signed-off-by: Wangshanshan <dominicw@nvidia.com>
2026-01-13 22:18:36 +08:00 · 2025-04-13 10:24:25 +08:00 · 2025-04-13 10:24:25 +08:00 · 5d3180be82
commit 5d3180be82
parent 74850c61e9
3 changed files with 1041 additions and 0 deletions
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -31,3 +31,4 @@ pytest-rerunfailures
 ruff==0.9.4
 lm_eval[api]==0.4.8
 docstring_parser
+genai-perf
--- a/tests/integration/defs/stress_test/stress_test.py
+++ b/tests/integration/defs/stress_test/stress_test.py
--- a/tests/integration/test_lists/test-db/l0_a10.yml
+++ b/tests/integration/test_lists/test-db/l0_a10.yml
@@ -18,6 +18,8 @@ l0_a10:
  - disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
  - disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0]
  - disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0]
+  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-MAX_UTILIZATION-pytorch-stress-test]
+  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-GUARANTEED_NO_EVICT-pytorch-stress-stage-alone]
 - condition:
    ranges:
      system_gpu_count:
@@ -108,6 +110,8 @@ l0_a10:
  - examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_cpp_runtime]
  - examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-enable_gemm_plugin]
  - examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-enable_gemm_plugin] # 3 mins
+  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-MAX_UTILIZATION-trt-stress-test]
+  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-GUARANTEED_NO_EVICT-trt-stress-stage-alone]
 - condition:
    ranges:
      system_gpu_count: