chore: increase A30 stages for cpp tests (#3811)

* increase A30 stages for cpp tests

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* enable parallel test runs for gpt_executor

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* clean

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* decrease freeGpuMemoryFraction of cpp tests

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

---------

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
This commit is contained in:
QI JUN 2025-04-24 16:34:39 -07:00 committed by GitHub
parent d72add1794
commit 991939a0f4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 45 additions and 39 deletions

View File

@ -1066,7 +1066,7 @@ INSTANTIATE_TEST_SUITE_P(GptV1Tests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1092,7 +1092,7 @@ INSTANTIATE_TEST_SUITE_P(GptV1RandomEndIdTests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1128,7 +1128,7 @@ INSTANTIATE_TEST_SUITE_P(GptTests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache
testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1154,7 +1154,7 @@ INSTANTIATE_TEST_SUITE_P(GptRandomEndIdTests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache
testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1178,7 +1178,7 @@ INSTANTIATE_TEST_SUITE_P(GptKVOffloadingTest, ParamTest,
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}),
testing::Values(256), // maxTokensInPagedKvCache
testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1210,7 +1210,7 @@ INSTANTIATE_TEST_SUITE_P(GptCudaGraphTests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1237,7 +1237,7 @@ INSTANTIATE_TEST_SUITE_P(GptSwitchBwTests, ParamTest,
BeamConfig{2, {1}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1261,7 +1261,7 @@ INSTANTIATE_TEST_SUITE_P(GptNProfilesTests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache
testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
testing::Values(false, true), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1301,7 +1301,7 @@ INSTANTIATE_TEST_SUITE_P(GptSqTests, ParamTest,
BeamConfig{1, {1}} //, BeamConfig{2, {2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1325,7 +1325,7 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_GptChunkedContextTests, ParamTest,
testing::Values(TrtGptModelIfbTestType::BULK), // TrtGptModelIfbTestType
testing::Values(BeamConfig{1, {1}}), // beam config
testing::Values(257), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1355,7 +1355,7 @@ INSTANTIATE_TEST_SUITE_P(GptChunkedLongContextTests, ParamTest,
TrtGptModelIfbTestType::RANDOM), // TrtGptModelIfbTestType
testing::Values(BeamConfig{1, {1}}), // beam config
testing::Values(std::nullopt, 1024), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1393,7 +1393,7 @@ INSTANTIATE_TEST_SUITE_P(GptDraftTests, ParamTest,
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}), // beamConfig
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1419,7 +1419,7 @@ INSTANTIATE_TEST_SUITE_P(GptLogitsTests, ParamTest,
TrtGptModelIfbTestType::RANDOM), // testType
testing::Values(BeamConfig{1, {1}}), // beamConfig
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false, true), // enableStreamingMode
@ -1445,7 +1445,7 @@ INSTANTIATE_TEST_SUITE_P(GptLogProbsTests, ParamTest,
TrtGptModelIfbTestType::RANDOM), // testType
testing::Values(BeamConfig{1, {1}}), // beamConfig
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1478,7 +1478,7 @@ INSTANTIATE_TEST_SUITE_P(GptjTests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1507,7 +1507,7 @@ INSTANTIATE_TEST_SUITE_P(MambaTests, ParamTest,
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1530,7 +1530,7 @@ INSTANTIATE_TEST_SUITE_P(RecurrentGemmaTests, ParamTest,
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1574,7 +1574,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaTests, ParamTest,
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1597,7 +1597,7 @@ INSTANTIATE_TEST_SUITE_P(ChatGlmTests, ParamTest,
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1621,7 +1621,7 @@ INSTANTIATE_TEST_SUITE_P(ChatGlm0Tests, ParamTest,
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1646,7 +1646,7 @@ INSTANTIATE_TEST_SUITE_P(MedusaTests, ParamTest,
testing::Values(TrtGptModelType::InflightFusedBatching), testing::Values(TrtGptModelIfbTestType::BULK),
testing::Values(BeamConfig{1, {1}}),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1670,7 +1670,7 @@ INSTANTIATE_TEST_SUITE_P(EagleTests, ParamTest,
testing::Values(TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT),
testing::Values(BeamConfig{1, {1}}),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1695,7 +1695,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaLookaheadDecodingTests, ParamTest,
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
testing::Values(BeamConfig{1, {1}}), // beamConfig
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1720,7 +1720,7 @@ INSTANTIATE_TEST_SUITE_P(ExplicitDraftTokensDecodingTests, ParamTest,
testing::Values(TrtGptModelType::InflightFusedBatching), testing::Values(TrtGptModelIfbTestType::BULK),
testing::Values(BeamConfig{1, {1}}), // beamConfig
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(false, true), // enableChunkedContext
testing::Values(false), // enableStreamingMode
@ -1751,7 +1751,7 @@ INSTANTIATE_TEST_SUITE_P(GptjFP8Tests, ParamTest,
BeamConfig{1, {1}} // , BeamConfig{2, {2}}, BeamConfig{2, {1, 2}}
),
testing::Values(std::nullopt), // maxTokensInPagedKvCache
testing::Values(std::nullopt), // freeGpuMemoryFraction
testing::Values(0.4), // freeGpuMemoryFraction
testing::Values(false), // enableTrtOverlap
testing::Values(true), // enableChunkedContext
testing::Values(false), // enableStreamingMode

View File

@ -125,8 +125,8 @@ TEST_P(EncDecParamsTest, validEncDecCtor)
std::filesystem::path encEnginePath = ENC_DEC_ENGINE_BASE / enginePathName / "encoder";
std::filesystem::path decEnginePath = ENC_DEC_ENGINE_BASE / enginePathName / "decoder";
ExecutorConfig executorConfig{};
FloatType freeGpuMemoryFraction = 0.5f;
FloatType crossKvCacheFraction = 0.5f;
FloatType freeGpuMemoryFraction = 0.4f;
FloatType crossKvCacheFraction = 0.4f;
KvCacheConfig kvCacheConfig{false, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction};
kvCacheConfig.setCrossKvCacheFraction(crossKvCacheFraction);
executorConfig.setKvCacheConfig(kvCacheConfig);

View File

@ -207,7 +207,7 @@ TEST_F(GptExecutorTest, ReturnAcceptedTokenLogits)
// Enable kv cache reuse of executorConfig
bool enableBlockReuse = true;
FloatType freeGpuMemoryFraction = 0.5;
FloatType freeGpuMemoryFraction = 0.4;
auto kvCacheConfig
= KvCacheConfig(enableBlockReuse, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction);
executorConfig.setKvCacheConfig(kvCacheConfig);

View File

@ -1238,6 +1238,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
"A10-TensorRT-[Post-Merge]-2": ["a10", "l0_a10", 2, 2],
"A30-TensorRT-[Post-Merge]-1": ["a30", "l0_a30", 1, 2],
"A30-TensorRT-[Post-Merge]-2": ["a30", "l0_a30", 2, 2],
"A30-CPP-[Post-Merge]-1": ["a30", "l0_a30", 1, 1],
"A100X-TensorRT-[Post-Merge]-1": ["a100x", "l0_a100", 1, 2],
"A100X-TensorRT-[Post-Merge]-2": ["a100x", "l0_a100", 2, 2],
"L40S-TensorRT-[Post-Merge]-1": ["l40s", "l0_l40s", 1, 2],

View File

@ -737,15 +737,7 @@ def run_single_gpu_tests(build_dir: _pl.Path,
if excluded_tests:
ctest.extend(["-E", "|".join(excluded_tests)])
gpt_tests = {"gpt", "gpt_session", "gpt_tests", "gpt_executor"}
# gpt* tests are not parallelized as it would cause OOM because kv cache memory allocations
# exist in multiple running tests
if gpt_tests.intersection(test_list):
parallel = 1
else:
parallel = default_test_parallel
parallel = default_test_parallel
if parallel_override := _os.environ.get("LLM_TEST_PARALLEL_OVERRIDE",
None):
parallel = int(parallel_override)

View File

@ -42,9 +42,7 @@ l0_a30:
- test_cpp.py::test_unit_tests[80]
- test_cpp.py::test_model[gpt-80]
- test_cpp.py::test_model[gpt_executor-80]
- test_cpp.py::test_model[gpt_session-80]
- test_cpp.py::test_model[gpt_tests-80]
- test_cpp.py::test_benchmarks[gpt-80]
- condition:
ranges:
system_gpu_count:
@ -145,3 +143,18 @@ l0_a30:
- examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-draft_len_4-float16-bs2] # 1 min
- examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_logits-draft_len_4-float16-bs2] # 1 min
- examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs2] # 1 min
- condition:
ranges:
system_gpu_count:
gte: 1
lte: 1
wildcards:
gpu:
- '*a30*'
linux_distribution_name: ubuntu*
terms:
stage: post_merge
backend: cpp
tests:
- test_cpp.py::test_model[gpt_session-80]
- test_cpp.py::test_benchmarks[gpt-80]