mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-16 07:53:55 +08:00
chore: increase A30 for cpp test (#3811)
* increase A30 for cpp test
  Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
* enable parallel run test for gpt_executor
  Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
* clean
  Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
* decrease freeGpuMemoryFraction of cpp tests
  Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
* fix
  Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
---------
Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
This commit is contained in:
parent
d72add1794
commit
991939a0f4
@ -1066,7 +1066,7 @@ INSTANTIATE_TEST_SUITE_P(GptV1Tests, ParamTest,
|
||||
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
|
||||
),
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(false), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1092,7 +1092,7 @@ INSTANTIATE_TEST_SUITE_P(GptV1RandomEndIdTests, ParamTest,
|
||||
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
|
||||
),
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
|
||||
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(false), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1128,7 +1128,7 @@ INSTANTIATE_TEST_SUITE_P(GptTests, ParamTest,
|
||||
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
|
||||
),
|
||||
testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
|
||||
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false, true), // enableTrtOverlap
|
||||
testing::Values(false), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1154,7 +1154,7 @@ INSTANTIATE_TEST_SUITE_P(GptRandomEndIdTests, ParamTest,
|
||||
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
|
||||
),
|
||||
testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
|
||||
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false, true), // enableTrtOverlap
|
||||
testing::Values(false), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1178,7 +1178,7 @@ INSTANTIATE_TEST_SUITE_P(GptKVOffloadingTest, ParamTest,
|
||||
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
|
||||
testing::Values(BeamConfig{1, {1}}),
|
||||
testing::Values(256), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
|
||||
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false, true), // enableTrtOverlap
|
||||
testing::Values(false), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1210,7 +1210,7 @@ INSTANTIATE_TEST_SUITE_P(GptCudaGraphTests, ParamTest,
|
||||
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
|
||||
),
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(false, true), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1237,7 +1237,7 @@ INSTANTIATE_TEST_SUITE_P(GptSwitchBwTests, ParamTest,
|
||||
BeamConfig{2, {1}} // , BeamConfig{2, {1, 2}}
|
||||
),
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(true), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1261,7 +1261,7 @@ INSTANTIATE_TEST_SUITE_P(GptNProfilesTests, ParamTest,
|
||||
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
|
||||
),
|
||||
testing::Values(std::nullopt, 1280), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt, 0.8), // freeGpuMemoryFraction
|
||||
testing::Values(std::nullopt, 0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false, true), // enableTrtOverlap
|
||||
testing::Values(false, true), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1301,7 +1301,7 @@ INSTANTIATE_TEST_SUITE_P(GptSqTests, ParamTest,
|
||||
BeamConfig{1, {1}} //, BeamConfig{2, {2}}
|
||||
),
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(false), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1325,7 +1325,7 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_GptChunkedContextTests, ParamTest,
|
||||
testing::Values(TrtGptModelIfbTestType::BULK), // TrtGptModelIfbTestType
|
||||
testing::Values(BeamConfig{1, {1}}), // beam config
|
||||
testing::Values(257), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(true), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1355,7 +1355,7 @@ INSTANTIATE_TEST_SUITE_P(GptChunkedLongContextTests, ParamTest,
|
||||
TrtGptModelIfbTestType::RANDOM), // TrtGptModelIfbTestType
|
||||
testing::Values(BeamConfig{1, {1}}), // beam config
|
||||
testing::Values(std::nullopt, 1024), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(true), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1393,7 +1393,7 @@ INSTANTIATE_TEST_SUITE_P(GptDraftTests, ParamTest,
|
||||
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
|
||||
testing::Values(BeamConfig{1, {1}}), // beamConfig
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(true), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1419,7 +1419,7 @@ INSTANTIATE_TEST_SUITE_P(GptLogitsTests, ParamTest,
|
||||
TrtGptModelIfbTestType::RANDOM), // testType
|
||||
testing::Values(BeamConfig{1, {1}}), // beamConfig
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(true), // enableChunkedContext
|
||||
testing::Values(false, true), // enableStreamingMode
|
||||
@ -1445,7 +1445,7 @@ INSTANTIATE_TEST_SUITE_P(GptLogProbsTests, ParamTest,
|
||||
TrtGptModelIfbTestType::RANDOM), // testType
|
||||
testing::Values(BeamConfig{1, {1}}), // beamConfig
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(true), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1478,7 +1478,7 @@ INSTANTIATE_TEST_SUITE_P(GptjTests, ParamTest,
|
||||
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
|
||||
),
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(false), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1507,7 +1507,7 @@ INSTANTIATE_TEST_SUITE_P(MambaTests, ParamTest,
|
||||
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
|
||||
testing::Values(BeamConfig{1, {1}}),
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(false), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1530,7 +1530,7 @@ INSTANTIATE_TEST_SUITE_P(RecurrentGemmaTests, ParamTest,
|
||||
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
|
||||
testing::Values(BeamConfig{1, {1}}),
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(false), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1574,7 +1574,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaTests, ParamTest,
|
||||
BeamConfig{1, {1}}, BeamConfig{2, {2}} // , BeamConfig{2, {1, 2}}
|
||||
),
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(false), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1597,7 +1597,7 @@ INSTANTIATE_TEST_SUITE_P(ChatGlmTests, ParamTest,
|
||||
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
|
||||
testing::Values(BeamConfig{1, {1}}),
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(false, true), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1621,7 +1621,7 @@ INSTANTIATE_TEST_SUITE_P(ChatGlm0Tests, ParamTest,
|
||||
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
|
||||
testing::Values(BeamConfig{1, {1}}),
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(false), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1646,7 +1646,7 @@ INSTANTIATE_TEST_SUITE_P(MedusaTests, ParamTest,
|
||||
testing::Values(TrtGptModelType::InflightFusedBatching), testing::Values(TrtGptModelIfbTestType::BULK),
|
||||
testing::Values(BeamConfig{1, {1}}),
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(false, true), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1670,7 +1670,7 @@ INSTANTIATE_TEST_SUITE_P(EagleTests, ParamTest,
|
||||
testing::Values(TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT),
|
||||
testing::Values(BeamConfig{1, {1}}),
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(false, true), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1695,7 +1695,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaLookaheadDecodingTests, ParamTest,
|
||||
TrtGptModelIfbTestType::BULK, TrtGptModelIfbTestType::WAVEFRONT, TrtGptModelIfbTestType::RANDOM),
|
||||
testing::Values(BeamConfig{1, {1}}), // beamConfig
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(false), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1720,7 +1720,7 @@ INSTANTIATE_TEST_SUITE_P(ExplicitDraftTokensDecodingTests, ParamTest,
|
||||
testing::Values(TrtGptModelType::InflightFusedBatching), testing::Values(TrtGptModelIfbTestType::BULK),
|
||||
testing::Values(BeamConfig{1, {1}}), // beamConfig
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(false, true), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
@ -1751,7 +1751,7 @@ INSTANTIATE_TEST_SUITE_P(GptjFP8Tests, ParamTest,
|
||||
BeamConfig{1, {1}} // , BeamConfig{2, {2}}, BeamConfig{2, {1, 2}}
|
||||
),
|
||||
testing::Values(std::nullopt), // maxTokensInPagedKvCache
|
||||
testing::Values(std::nullopt), // freeGpuMemoryFraction
|
||||
testing::Values(0.4), // freeGpuMemoryFraction
|
||||
testing::Values(false), // enableTrtOverlap
|
||||
testing::Values(true), // enableChunkedContext
|
||||
testing::Values(false), // enableStreamingMode
|
||||
|
||||
@ -125,8 +125,8 @@ TEST_P(EncDecParamsTest, validEncDecCtor)
|
||||
std::filesystem::path encEnginePath = ENC_DEC_ENGINE_BASE / enginePathName / "encoder";
|
||||
std::filesystem::path decEnginePath = ENC_DEC_ENGINE_BASE / enginePathName / "decoder";
|
||||
ExecutorConfig executorConfig{};
|
||||
FloatType freeGpuMemoryFraction = 0.5f;
|
||||
FloatType crossKvCacheFraction = 0.5f;
|
||||
FloatType freeGpuMemoryFraction = 0.4f;
|
||||
FloatType crossKvCacheFraction = 0.4f;
|
||||
KvCacheConfig kvCacheConfig{false, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction};
|
||||
kvCacheConfig.setCrossKvCacheFraction(crossKvCacheFraction);
|
||||
executorConfig.setKvCacheConfig(kvCacheConfig);
|
||||
|
||||
@ -207,7 +207,7 @@ TEST_F(GptExecutorTest, ReturnAcceptedTokenLogits)
|
||||
|
||||
// Enable kv cache reuse of executorConfig
|
||||
bool enableBlockReuse = true;
|
||||
FloatType freeGpuMemoryFraction = 0.5;
|
||||
FloatType freeGpuMemoryFraction = 0.4;
|
||||
auto kvCacheConfig
|
||||
= KvCacheConfig(enableBlockReuse, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction);
|
||||
executorConfig.setKvCacheConfig(kvCacheConfig);
|
||||
|
||||
@ -1238,6 +1238,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
|
||||
"A10-TensorRT-[Post-Merge]-2": ["a10", "l0_a10", 2, 2],
|
||||
"A30-TensorRT-[Post-Merge]-1": ["a30", "l0_a30", 1, 2],
|
||||
"A30-TensorRT-[Post-Merge]-2": ["a30", "l0_a30", 2, 2],
|
||||
"A30-CPP-[Post-Merge]-1": ["a30", "l0_a30", 1, 1],
|
||||
"A100X-TensorRT-[Post-Merge]-1": ["a100x", "l0_a100", 1, 2],
|
||||
"A100X-TensorRT-[Post-Merge]-2": ["a100x", "l0_a100", 2, 2],
|
||||
"L40S-TensorRT-[Post-Merge]-1": ["l40s", "l0_l40s", 1, 2],
|
||||
|
||||
@ -737,15 +737,7 @@ def run_single_gpu_tests(build_dir: _pl.Path,
|
||||
if excluded_tests:
|
||||
ctest.extend(["-E", "|".join(excluded_tests)])
|
||||
|
||||
gpt_tests = {"gpt", "gpt_session", "gpt_tests", "gpt_executor"}
|
||||
|
||||
# gpt* tests are not parallelized as it would cause OOM because kv cache memory allocations
|
||||
# exist in multiple running tests
|
||||
if gpt_tests.intersection(test_list):
|
||||
parallel = 1
|
||||
else:
|
||||
parallel = default_test_parallel
|
||||
|
||||
parallel = default_test_parallel
|
||||
if parallel_override := _os.environ.get("LLM_TEST_PARALLEL_OVERRIDE",
|
||||
None):
|
||||
parallel = int(parallel_override)
|
||||
|
||||
@ -42,9 +42,7 @@ l0_a30:
|
||||
- test_cpp.py::test_unit_tests[80]
|
||||
- test_cpp.py::test_model[gpt-80]
|
||||
- test_cpp.py::test_model[gpt_executor-80]
|
||||
- test_cpp.py::test_model[gpt_session-80]
|
||||
- test_cpp.py::test_model[gpt_tests-80]
|
||||
- test_cpp.py::test_benchmarks[gpt-80]
|
||||
- condition:
|
||||
ranges:
|
||||
system_gpu_count:
|
||||
@ -145,3 +143,18 @@ l0_a30:
|
||||
- examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-draft_len_4-float16-bs2] # 1 min
|
||||
- examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_logits-draft_len_4-float16-bs2] # 1 min
|
||||
- examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs2] # 1 min
|
||||
- condition:
|
||||
ranges:
|
||||
system_gpu_count:
|
||||
gte: 1
|
||||
lte: 1
|
||||
wildcards:
|
||||
gpu:
|
||||
- '*a30*'
|
||||
linux_distribution_name: ubuntu*
|
||||
terms:
|
||||
stage: post_merge
|
||||
backend: cpp
|
||||
tests:
|
||||
- test_cpp.py::test_model[gpt_session-80]
|
||||
- test_cpp.py::test_benchmarks[gpt-80]
|
||||
|
||||
Loading…
Reference in New Issue
Block a user