Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)

[None][fix] fix Llama3 eagle3 test case OOM (#6832)
Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>

commit fd8f417bf2 (parent 0958efdcff)

@@ -172,7 +172,8 @@ meta-llama/Llama-3.2-3B:
     kv_cache_quant_algo: FP8
     accuracy: 33.629
 meta-llama/Llama-3.3-70B-Instruct:
-  - spec_dec_algo: Eagle
+  - quant_algo: FP8
+    spec_dec_algo: Eagle
     accuracy: 33.244
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8

@@ -59,7 +59,8 @@ meta-llama/Llama-3.2-3B:
     accuracy: 60.60
 meta-llama/Llama-3.3-70B-Instruct:
   - accuracy: 81.31
-  - spec_dec_algo: Eagle
+  - quant_algo: FP8
+    spec_dec_algo: Eagle
     accuracy: 81.31
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8

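Reading the flattened YAML above: each model name maps to a list of reference entries, and an entry's quant_algo/spec_dec_algo keys select which test variant the accuracy value belongs to. A small illustrative sketch of that lookup follows (it uses PyYAML; the lookup() helper is hypothetical, not the harness's actual code):

# Illustrative only: how a reference entry like the ones patched above
# can be selected. The lookup() helper is hypothetical, not harness code.
import yaml

references = yaml.safe_load("""
meta-llama/Llama-3.3-70B-Instruct:
  - accuracy: 81.31
  - quant_algo: FP8
    spec_dec_algo: Eagle
    accuracy: 81.31
""")

def lookup(model, **filters):
    # Return the first entry whose keys match every requested filter.
    for entry in references[model]:
        if all(entry.get(k) == v for k, v in filters.items()):
            return entry
    raise KeyError(f"no reference for {model} with {filters}")

print(lookup("meta-llama/Llama-3.3-70B-Instruct",
             quant_algo="FP8", spec_dec_algo="Eagle")["accuracy"])  # 81.31
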
@@ -383,25 +383,27 @@ class TestLlama3_3_70BInstruct(LlmapiAccuracyTestHarness):
         task.evaluate(llm,
                       extra_evaluator_kwargs=dict(apply_chat_template=True))
 
     @skip_pre_hopper
     @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("eagle3_one_model", [True, False])
-    def test_eagle3_tp8(self, eagle3_one_model):
-        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct"
+    def test_fp8_eagle3_tp8(self, eagle3_one_model):
+        model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8"
         eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.3-Instruct-70B"
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         spec_config = EagleDecodingConfig(max_draft_len=4,
                                           speculative_model_dir=eagle_model_dir,
                                           eagle3_one_model=eagle3_one_model)
-        pytorch_config = dict(disable_overlap_scheduler=True, )
+        pytorch_config = dict(
+            disable_overlap_scheduler=True,
+            cuda_graph_config=CudaGraphConfig(max_batch_size=1))
         with LLM(model_path,
                  max_batch_size=16,
                  tensor_parallel_size=8,
                  speculative_config=spec_config,
                  kv_cache_config=kv_cache_config,
                  **pytorch_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
     @skip_pre_hopper

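Read together, this hunk swaps the BF16 Llama-3.3-70B-Instruct checkpoint for the ModelOpt FP8-quantized one (roughly halving weight memory) and caps CUDA graph capture at batch size 1, which is what relieves the OOM. Below is a minimal self-contained sketch of the resulting configuration, assuming the import paths shown match TensorRT-LLM's LLM API (the test module uses these same class names) and with placeholder checkpoint paths:

# Sketch of the fixed test's OOM-avoiding setup. Import paths are an
# assumption about TensorRT-LLM's LLM API; checkpoint paths are placeholders.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig,
                                 KvCacheConfig)

model_path = "<models_root>/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8"
eagle_model_dir = "<models_root>/EAGLE3-LLaMA3.3-Instruct-70B"

# Leave 40% of free GPU memory outside the KV cache for weights and drafts.
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)

# Eagle3 speculative decoding with a 4-token draft window.
spec_config = EagleDecodingConfig(max_draft_len=4,
                                  speculative_model_dir=eagle_model_dir,
                                  eagle3_one_model=True)

with LLM(model_path,
         max_batch_size=16,
         tensor_parallel_size=8,
         speculative_config=spec_config,
         kv_cache_config=kv_cache_config,
         disable_overlap_scheduler=True,
         # Capture CUDA graphs only for batch size 1 rather than for every
         # batch size up to 16, bounding the capture-time memory overhead.
         cuda_graph_config=CudaGraphConfig(max_batch_size=1)) as llm:
    for out in llm.generate(["The capital of France is"]):
        print(out.outputs[0].text)
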
@@ -450,8 +450,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance]
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=True]
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=False]
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True]
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False]
 accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype

@@ -66,8 +66,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=True]
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=False]
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True]
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]

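The [eagle3_one_model=True] suffixes in these lists are pytest parametrize IDs. parametrize_with_ids is a helper from the accuracy-test harness; a rough plain-pytest equivalent (an assumption inferred from the collected IDs, not the helper's actual source) would be:

import pytest

# Assumed behavior of parametrize_with_ids, inferred from the collected
# test IDs above; the real helper lives in the test harness.
def parametrize_with_ids(name, values):
    return pytest.mark.parametrize(
        name, values, ids=[f"{name}={v}" for v in values])

@parametrize_with_ids("eagle3_one_model", [True, False])
def test_fp8_eagle3_tp8(eagle3_one_model):
    # Collected as ...::test_fp8_eagle3_tp8[eagle3_one_model=True] etc.
    assert isinstance(eagle3_one_model, bool)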