[TRTQA-2920][chore] improve hang tests (#6781)

Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com>

parent 8845e0f065
commit e35fca4272
@@ -505,11 +505,12 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
         task = JsonModeEval(self.MODEL_NAME)
         task.evaluate(llm)

     @pytest.mark.skip_less_device(2)
     @pytest.mark.parametrize("tp,pp", [(1, 2), (2, 1), (2, 2)],
                              ids=["tp1pp2", "tp2pp1", "tp2pp2"])
     @pytest.mark.parametrize("testset", ["GSM8K", "MMLU"])
     def test_tp_pp_symmetric(self, tp, pp, testset):
+        if tp * pp * 2 > get_device_count():
+            pytest.skip(f"Not enough devices for tp={tp}*pp={pp} test")
         return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, pp, tp, pp,
                                  tp, get_accuracy_task(testset))
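The added guard is the substance of the hang fix: a multi-rank job launched with more ranks than visible GPUs never completes its first collective, so the test stalls instead of failing fast. The factor of 2 apparently accounts for the separate context and generation instances that run_parallel_test launches. A minimal sketch of the pattern, assuming get_device_count() is a thin wrapper over torch.cuda.device_count() (the wrapper below is an assumption for illustration, not the repo's helper):

import pytest
import torch


def get_device_count() -> int:
    # Assumed stand-in for the repo's helper: number of visible CUDA devices.
    return torch.cuda.device_count()


def guard_world_size(world_size: int) -> None:
    # Skip up front rather than let a collective wait forever on ranks
    # that can never be scheduled.
    if get_device_count() < world_size:
        pytest.skip(f"Needs {world_size} GPUs, found {get_device_count()}")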
@@ -517,6 +518,9 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("gen_tp", [1, 2])
     @pytest.mark.parametrize("testset", ["GSM8K", "MMLU"])
     def test_ctx_pp_gen_tp_asymmetric(self, ctx_pp, gen_tp, testset):
+        if ctx_pp * gen_tp * 2 > get_device_count():
+            pytest.skip(
+                f"Not enough devices for ctx_pp={ctx_pp}*gen_tp={gen_tp} test")
         return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, ctx_pp, 1, 1,
                                  gen_tp, get_accuracy_task(testset))
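This test and the DeepSeek one below use a parametrize_with_ids helper alongside the stock pytest.mark.parametrize. A plausible sketch of what it does, assuming it merely derives readable test IDs of the form name=value (the real helper lives in the repo's test utilities and may differ):

import pytest


def parametrize_with_ids(name, values):
    # Assumed behavior: plain pytest.mark.parametrize plus generated IDs,
    # e.g. "gen_tp=1" / "gen_tp=2", so test reports stay readable.
    return pytest.mark.parametrize(
        name, values, ids=[f"{name}={v}" for v in values])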
@@ -527,6 +531,7 @@ class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
     MODEL_PATH = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct"

+    @pytest.mark.skip_less_device(8)
     @pytest.mark.parametrize("overlap_scheduler", [False, True])
     def test_auto_dtype(self, overlap_scheduler):
         ctx_server_config = {"disable_overlap_scheduler": True}
@@ -565,6 +570,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite"
     MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"

+    @pytest.mark.skip_less_device(8)
     @parametrize_with_ids("overlap_scheduler", [True, False])
     @parametrize_with_ids("mtp_nextn",
                           [0, pytest.param(2, marks=skip_pre_hopper)])
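skip_less_device(8) is a custom marker, not a pytest built-in. One way such a marker can be honored from conftest.py, assuming it takes the minimum device count as its only argument (the hook body below is an assumption about the repo's implementation, not a copy of it):

import pytest
import torch


def pytest_collection_modifyitems(config, items):
    # Runs once after collection; converts the marker into a skip when the
    # machine has fewer GPUs than the test declares it needs.
    available = torch.cuda.device_count()
    for item in items:
        marker = item.get_closest_marker("skip_less_device")
        if marker is not None and available < marker.args[0]:
            item.add_marker(pytest.mark.skip(
                reason=f"needs {marker.args[0]} devices, found {available}"))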
@@ -250,7 +250,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
                 enable_padding=True),
         )
         kv_cache_config = KvCacheConfig(
-            enable_block_reuse=True
+            enable_block_reuse=True, free_gpu_memory_fraction=0.8
         )  # both one-model and two-model support this feature

         eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B"
@@ -280,7 +280,8 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
             cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
         )

-        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.8)

         spec_config = NGramDecodingConfig(
             max_draft_len=4,
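Both KvCacheConfig changes cap the KV cache at 80% of free GPU memory instead of letting it claim everything; the headroom presumably goes to the draft/EAGLE3 engines these speculative-decoding tests load alongside the target model, which otherwise contend for memory and can stall. A minimal sketch of wiring the capped config into the LLM API, using only the tensorrt_llm.llmapi surface this diff already imports and a placeholder model path:

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

kv_cache_config = KvCacheConfig(
    enable_block_reuse=False,
    # Leave ~20% of free VRAM for the draft model and runtime buffers.
    free_gpu_memory_fraction=0.8,
)

llm = LLM(model="/path/to/Llama-3.1-8B-Instruct",  # placeholder path
          kv_cache_config=kv_cache_config)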