chore: increase A30 for cpp test (#3811)

* increase A30 for cpp test

  Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* enable parallel run test for gpt_executor

  Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* clean

  Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* decrease freeGpuMemoryFraction of cpp tests

  Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix

  Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

---------

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
2026-02-16 07:53:55 +08:00 · 2025-04-24 16:34:39 -07:00 · 2025-04-24 16:34:39 -07:00 · 991939a0f4
commit 991939a0f4
parent d72add1794
6 changed files with 45 additions and 39 deletions
--- a/cpp/tests/batch_manager/trtGptModelRealDecoderTest.cpp
+++ b/cpp/tests/batch_manager/trtGptModelRealDecoderTest.cpp
@ -1066,7 +1066,7 @@ INSTANTIATE_TEST_SUITE_P(GptV1Tests, ParamTest,
            BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
            ),
        testing::Values(std::nullopt),             // maxTokensInPagedKvCache
-        testing::Values(std::nullopt, 0.8),        // freeGpuMemoryFraction
+        testing::Values(0.4),                      // freeGpuMemoryFraction
        testing::Values(false),                    // enableTrtOverlap
        testing::Values(false),                    // enableChunkedContext
        testing::Values(false),                    // enableStreamingMode
@ -1092,7 +1092,7 @@ INSTANTIATE_TEST_SUITE_P(GptV1RandomEndIdTests, ParamTest,
            BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
            ),
        testing::Values(std::nullopt),             // maxTokensInPagedKvCache
-        testing::Values(std::nullopt, 0.8),        // freeGpuMemoryFraction
+        testing::Values(std::nullopt, 0.4),        // freeGpuMemoryFraction
        testing::Values(false),                    // enableTrtOverlap
        testing::Values(false),                    // enableChunkedContext
        testing::Values(false),                    // enableStreamingMode
@ -1128,7 +1128,7 @@ INSTANTIATE_TEST_SUITE_P(GptTests, ParamTest,
            BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
            ),
        testing::Values(std::nullopt, 1280),       // maxTokensInPagedKvCache
-        testing::Values(std::nullopt, 0.8),        // freeGpuMemoryFraction
+        testing::Values(std::nullopt, 0.4),        // freeGpuMemoryFraction
        testing::Values(false, true),              // enableTrtOverlap
        testing::Values(false),                    // enableChunkedContext
        testing::Values(false),                    // enableStreamingMode
@ -1154,7 +1154,7 @@ INSTANTIATE_TEST_SUITE_P(GptRandomEndIdTests, ParamTest,
            BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
            ),
        testing::Values(std::nullopt, 1280),       // maxTokensInPagedKvCache
-        testing::Values(std::nullopt, 0.8),        // freeGpuMemoryFraction
+        testing::Values(std::nullopt, 0.4),        // freeGpuMemoryFraction
        testing::Values(false, true),              // enableTrtOverlap
        testing::Values(false),                    // enableChunkedContext
        testing::Values(false),                    // enableStreamingMode
@ -1178,7 +1178,7 @@ INSTANTIATE_TEST_SUITE_P(GptKVOffloadingTest, ParamTest,
            TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
        testing::Values(BeamConfig{1, {1}}),
        testing::Values(256),               // maxTokensInPagedKvCache
-        testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
+        testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
        testing::Values(false, true),       // enableTrtOverlap
        testing::Values(false),             // enableChunkedContext
        testing::Values(false),             // enableStreamingMode
@ -1210,7 +1210,7 @@ INSTANTIATE_TEST_SUITE_P(GptCudaGraphTests, ParamTest,
            BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
            ),
        testing::Values(std::nullopt),             // maxTokensInPagedKvCache
-        testing::Values(std::nullopt),             // freeGpuMemoryFraction
+        testing::Values(0.4),                      // freeGpuMemoryFraction
        testing::Values(false),                    // enableTrtOverlap
        testing::Values(false, true),              // enableChunkedContext
        testing::Values(false),                    // enableStreamingMode
@ -1237,7 +1237,7 @@ INSTANTIATE_TEST_SUITE_P(GptSwitchBwTests, ParamTest,
            BeamConfig{2, {1}}         // , BeamConfig{2, {1, 2}}
            ),
        testing::Values(std::nullopt), // maxTokensInPagedKvCache
-        testing::Values(std::nullopt), // freeGpuMemoryFraction
+        testing::Values(0.4),          // freeGpuMemoryFraction
        testing::Values(false),        // enableTrtOverlap
        testing::Values(true),         // enableChunkedContext
        testing::Values(false),        // enableStreamingMode
@ -1261,7 +1261,7 @@ INSTANTIATE_TEST_SUITE_P(GptNProfilesTests, ParamTest,
            BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
            ),
        testing::Values(std::nullopt, 1280),       // maxTokensInPagedKvCache
-        testing::Values(std::nullopt, 0.8),        // freeGpuMemoryFraction
+        testing::Values(std::nullopt, 0.4),        // freeGpuMemoryFraction
        testing::Values(false, true),              // enableTrtOverlap
        testing::Values(false, true),              // enableChunkedContext
        testing::Values(false),                    // enableStreamingMode
@ -1301,7 +1301,7 @@ INSTANTIATE_TEST_SUITE_P(GptSqTests, ParamTest,
            BeamConfig{1, {1}}         //, BeamConfig{2, {2}}
            ),
        testing::Values(std::nullopt), // maxTokensInPagedKvCache
-        testing::Values(std::nullopt), // freeGpuMemoryFraction
+        testing::Values(0.4),          // freeGpuMemoryFraction
        testing::Values(false),        // enableTrtOverlap
        testing::Values(false),        // enableChunkedContext
        testing::Values(false),        // enableStreamingMode
@ -1325,7 +1325,7 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_GptChunkedContextTests, ParamTest,
        testing::Values(TrtGptModelIfbTestType::BULK), // TrtGptModelIfbTestType
        testing::Values(BeamConfig{1, {1}}),           // beam config
        testing::Values(257),                          // maxTokensInPagedKvCache
-        testing::Values(std::nullopt),                 // freeGpuMemoryFraction
+        testing::Values(0.4),                          // freeGpuMemoryFraction
        testing::Values(false),                        // enableTrtOverlap
        testing::Values(true),                         // enableChunkedContext
        testing::Values(false),                        // enableStreamingMode
@ -1355,7 +1355,7 @@ INSTANTIATE_TEST_SUITE_P(GptChunkedLongContextTests, ParamTest,
            TrtGptModelIfbTestType::RANDOM), // TrtGptModelIfbTestType
        testing::Values(BeamConfig{1, {1}}), // beam config
        testing::Values(std::nullopt, 1024), // maxTokensInPagedKvCache
-        testing::Values(std::nullopt),       // freeGpuMemoryFraction
+        testing::Values(0.4),                // freeGpuMemoryFraction
        testing::Values(false),              // enableTrtOverlap
        testing::Values(true),               // enableChunkedContext
        testing::Values(false),              // enableStreamingMode
@ -1393,7 +1393,7 @@ INSTANTIATE_TEST_SUITE_P(GptDraftTests, ParamTest,
            TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
        testing::Values(BeamConfig{1, {1}}), // beamConfig
        testing::Values(std::nullopt),       // maxTokensInPagedKvCache
-        testing::Values(std::nullopt),       // freeGpuMemoryFraction
+        testing::Values(0.4),                // freeGpuMemoryFraction
        testing::Values(false),              // enableTrtOverlap
        testing::Values(true),               // enableChunkedContext
        testing::Values(false),              // enableStreamingMode
@ -1419,7 +1419,7 @@ INSTANTIATE_TEST_SUITE_P(GptLogitsTests, ParamTest,
            TrtGptModelIfbTestType::RANDOM),                                                        // testType
        testing::Values(BeamConfig{1, {1}}),                                                        // beamConfig
        testing::Values(std::nullopt), // maxTokensInPagedKvCache
-        testing::Values(std::nullopt), // freeGpuMemoryFraction
+        testing::Values(0.4),          // freeGpuMemoryFraction
        testing::Values(false),        // enableTrtOverlap
        testing::Values(true),         // enableChunkedContext
        testing::Values(false, true),  // enableStreamingMode
@ -1445,7 +1445,7 @@ INSTANTIATE_TEST_SUITE_P(GptLogProbsTests, ParamTest,
            TrtGptModelIfbTestType::RANDOM),                     // testType
        testing::Values(BeamConfig{1, {1}}),                     // beamConfig
        testing::Values(std::nullopt),                           // maxTokensInPagedKvCache
-        testing::Values(std::nullopt),                           // freeGpuMemoryFraction
+        testing::Values(0.4),                                    // freeGpuMemoryFraction
        testing::Values(false),                                  // enableTrtOverlap
        testing::Values(true),                                   // enableChunkedContext
        testing::Values(false),                                  // enableStreamingMode
@ -1478,7 +1478,7 @@ INSTANTIATE_TEST_SUITE_P(GptjTests, ParamTest,
            BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
            ),
        testing::Values(std::nullopt),             // maxTokensInPagedKvCache
-        testing::Values(std::nullopt),             // freeGpuMemoryFraction
+        testing::Values(0.4),                      // freeGpuMemoryFraction
        testing::Values(false),                    // enableTrtOverlap
        testing::Values(false),                    // enableChunkedContext
        testing::Values(false),                    // enableStreamingMode
@ -1507,7 +1507,7 @@ INSTANTIATE_TEST_SUITE_P(MambaTests, ParamTest,
            TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
        testing::Values(BeamConfig{1, {1}}),
        testing::Values(std::nullopt), // maxTokensInPagedKvCache
-        testing::Values(std::nullopt), // freeGpuMemoryFraction
+        testing::Values(0.4),          // freeGpuMemoryFraction
        testing::Values(false),        // enableTrtOverlap
        testing::Values(false),        // enableChunkedContext
        testing::Values(false),        // enableStreamingMode
@ -1530,7 +1530,7 @@ INSTANTIATE_TEST_SUITE_P(RecurrentGemmaTests, ParamTest,
            TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
        testing::Values(BeamConfig{1, {1}}),
        testing::Values(std::nullopt), // maxTokensInPagedKvCache
-        testing::Values(std::nullopt), // freeGpuMemoryFraction
+        testing::Values(0.4),          // freeGpuMemoryFraction
        testing::Values(false),        // enableTrtOverlap
        testing::Values(false),        // enableChunkedContext
        testing::Values(false),        // enableStreamingMode
@ -1574,7 +1574,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaTests, ParamTest,
            BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
            ),
        testing::Values(std::nullopt),             // maxTokensInPagedKvCache
-        testing::Values(std::nullopt),             // freeGpuMemoryFraction
+        testing::Values(0.4),                      // freeGpuMemoryFraction
        testing::Values(false),                    // enableTrtOverlap
        testing::Values(false),                    // enableChunkedContext
        testing::Values(false),                    // enableStreamingMode
@ -1597,7 +1597,7 @@ INSTANTIATE_TEST_SUITE_P(ChatGlmTests, ParamTest,
            TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
        testing::Values(BeamConfig{1, {1}}),
        testing::Values(std::nullopt), // maxTokensInPagedKvCache
-        testing::Values(std::nullopt), // freeGpuMemoryFraction
+        testing::Values(0.4),          // freeGpuMemoryFraction
        testing::Values(false),        // enableTrtOverlap
        testing::Values(false, true),  // enableChunkedContext
        testing::Values(false),        // enableStreamingMode
@ -1621,7 +1621,7 @@ INSTANTIATE_TEST_SUITE_P(ChatGlm0Tests, ParamTest,
            TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
        testing::Values(BeamConfig{1, {1}}),
        testing::Values(std::nullopt), // maxTokensInPagedKvCache
-        testing::Values(std::nullopt), // freeGpuMemoryFraction
+        testing::Values(0.4),          // freeGpuMemoryFraction
        testing::Values(false),        // enableTrtOverlap
        testing::Values(false),        // enableChunkedContext
        testing::Values(false),        // enableStreamingMode
@ -1646,7 +1646,7 @@ INSTANTIATE_TEST_SUITE_P(MedusaTests, ParamTest,
        testing::Values(TrtGptModelType::InflightFusedBatching), testing::Values(TrtGptModelIfbTestType::BULK),
        testing::Values(BeamConfig{1, {1}}),
        testing::Values(std::nullopt), // maxTokensInPagedKvCache
-        testing::Values(std::nullopt), // freeGpuMemoryFraction
+        testing::Values(0.4),          // freeGpuMemoryFraction
        testing::Values(false),        // enableTrtOverlap
        testing::Values(false, true),  // enableChunkedContext
        testing::Values(false),        // enableStreamingMode
@ -1670,7 +1670,7 @@ INSTANTIATE_TEST_SUITE_P(EagleTests, ParamTest,
        testing::Values(TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT),
        testing::Values(BeamConfig{1, {1}}),
        testing::Values(std::nullopt), // maxTokensInPagedKvCache
-        testing::Values(std::nullopt), // freeGpuMemoryFraction
+        testing::Values(0.4),          // freeGpuMemoryFraction
        testing::Values(false),        // enableTrtOverlap
        testing::Values(false, true),  // enableChunkedContext
        testing::Values(false),        // enableStreamingMode
@ -1695,7 +1695,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaLookaheadDecodingTests, ParamTest,
            TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
        testing::Values(BeamConfig{1, {1}}), // beamConfig
        testing::Values(std::nullopt),       // maxTokensInPagedKvCache
-        testing::Values(std::nullopt),       // freeGpuMemoryFraction
+        testing::Values(0.4),                // freeGpuMemoryFraction
        testing::Values(false),              // enableTrtOverlap
        testing::Values(false),              // enableChunkedContext
        testing::Values(false),              // enableStreamingMode
@ -1720,7 +1720,7 @@ INSTANTIATE_TEST_SUITE_P(ExplicitDraftTokensDecodingTests, ParamTest,
        testing::Values(TrtGptModelType::InflightFusedBatching), testing::Values(TrtGptModelIfbTestType::BULK),
        testing::Values(BeamConfig{1, {1}}), // beamConfig
        testing::Values(std::nullopt),       // maxTokensInPagedKvCache
-        testing::Values(std::nullopt),       // freeGpuMemoryFraction
+        testing::Values(0.4),                // freeGpuMemoryFraction
        testing::Values(false),              // enableTrtOverlap
        testing::Values(false, true),        // enableChunkedContext
        testing::Values(false),              // enableStreamingMode
@ -1751,7 +1751,7 @@ INSTANTIATE_TEST_SUITE_P(GptjFP8Tests, ParamTest,
            BeamConfig{1, {1}}         // , BeamConfig{2, {2}}, BeamConfig{2, {1, 2}}
            ),
        testing::Values(std::nullopt), // maxTokensInPagedKvCache
-        testing::Values(std::nullopt), // freeGpuMemoryFraction
+        testing::Values(0.4),          // freeGpuMemoryFraction
        testing::Values(false),        // enableTrtOverlap
        testing::Values(true),         // enableChunkedContext
        testing::Values(false),        // enableStreamingMode
--- a/cpp/tests/executor/encDecTest.cpp
+++ b/cpp/tests/executor/encDecTest.cpp
@ -125,8 +125,8 @@ TEST_P(EncDecParamsTest, validEncDecCtor)
    std::filesystem::path encEnginePath = ENC_DEC_ENGINE_BASE / enginePathName / "encoder";
    std::filesystem::path decEnginePath = ENC_DEC_ENGINE_BASE / enginePathName / "decoder";
    ExecutorConfig executorConfig{};
-    FloatType freeGpuMemoryFraction = 0.5f;
-    FloatType crossKvCacheFraction = 0.5f;
+    FloatType freeGpuMemoryFraction = 0.4f;
+    FloatType crossKvCacheFraction = 0.4f;
    KvCacheConfig kvCacheConfig{false, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction};
    kvCacheConfig.setCrossKvCacheFraction(crossKvCacheFraction);
    executorConfig.setKvCacheConfig(kvCacheConfig);
--- a/cpp/tests/executor/executorTest.cpp
+++ b/cpp/tests/executor/executorTest.cpp
@ -207,7 +207,7 @@ TEST_F(GptExecutorTest, ReturnAcceptedTokenLogits)

    // Enable kv cache reuse of executorConfig
    bool enableBlockReuse = true;
-    FloatType freeGpuMemoryFraction = 0.5;
+    FloatType freeGpuMemoryFraction = 0.4;
    auto kvCacheConfig
        = KvCacheConfig(enableBlockReuse, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction);
    executorConfig.setKvCacheConfig(kvCacheConfig);
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@ -1238,6 +1238,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
        "A10-TensorRT-[Post-Merge]-2": ["a10", "l0_a10", 2, 2],
        "A30-TensorRT-[Post-Merge]-1": ["a30", "l0_a30", 1, 2],
        "A30-TensorRT-[Post-Merge]-2": ["a30", "l0_a30", 2, 2],
+        "A30-CPP-[Post-Merge]-1": ["a30", "l0_a30", 1, 1],
        "A100X-TensorRT-[Post-Merge]-1": ["a100x", "l0_a100", 1, 2],
        "A100X-TensorRT-[Post-Merge]-2": ["a100x", "l0_a100", 2, 2],
        "L40S-TensorRT-[Post-Merge]-1": ["l40s", "l0_l40s", 1, 2],
--- a/tests/integration/defs/cpp_common.py
+++ b/tests/integration/defs/cpp_common.py
@ -737,15 +737,7 @@ def run_single_gpu_tests(build_dir: _pl.Path,
        if excluded_tests:
            ctest.extend(["-E", "|".join(excluded_tests)])

-        gpt_tests = {"gpt", "gpt_session", "gpt_tests", "gpt_executor"}
-
-        # gpt* tests are not parallelized as it would cause OOM because kv cache memory allocations
-        # exist in multiple running tests
-        if gpt_tests.intersection(test_list):
-            parallel = 1
-        else:
-            parallel = default_test_parallel
-
+        parallel = default_test_parallel
        if parallel_override := _os.environ.get("LLM_TEST_PARALLEL_OVERRIDE",
                                                None):
            parallel = int(parallel_override)
--- a/tests/integration/test_lists/test-db/l0_a30.yml
+++ b/tests/integration/test_lists/test-db/l0_a30.yml
@ -42,9 +42,7 @@ l0_a30:
  - test_cpp.py::test_unit_tests[80]
  - test_cpp.py::test_model[gpt-80]
  - test_cpp.py::test_model[gpt_executor-80]
-  - test_cpp.py::test_model[gpt_session-80]
  - test_cpp.py::test_model[gpt_tests-80]
-  - test_cpp.py::test_benchmarks[gpt-80]
 - condition:
    ranges:
      system_gpu_count:
@ -145,3 +143,18 @@ l0_a30:
  - examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-draft_len_4-float16-bs2] # 1 min
  - examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_logits-draft_len_4-float16-bs2] # 1 min
  - examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs2] # 1 min
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*a30*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: post_merge
+      backend: cpp
+  tests:
+  - test_cpp.py::test_model[gpt_session-80]
+  - test_cpp.py::test_benchmarks[gpt-80]