[None][fix] fix Llama3 eagle3 test case OOM (#6832)

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Ivy Zhang 2025-08-13 14:21:05 +08:00 committed by GitHub
parent 0958efdcff
commit fd8f417bf2
5 changed files with 15 additions and 11 deletions


@@ -172,7 +172,8 @@ meta-llama/Llama-3.2-3B:
     kv_cache_quant_algo: FP8
     accuracy: 33.629
 meta-llama/Llama-3.3-70B-Instruct:
-  - spec_dec_algo: Eagle
+  - quant_algo: FP8
+    spec_dec_algo: Eagle
     accuracy: 33.244
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8


@@ -59,7 +59,8 @@ meta-llama/Llama-3.2-3B:
     accuracy: 60.60
 meta-llama/Llama-3.3-70B-Instruct:
   - accuracy: 81.31
-  - spec_dec_algo: Eagle
+  - quant_algo: FP8
+    spec_dec_algo: Eagle
     accuracy: 81.31
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
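
Both reference files map a model name to a list of accuracy entries keyed by optional quant_algo, kv_cache_quant_algo, and spec_dec_algo fields; the two hunks above move the Eagle thresholds under an FP8 entry so the renamed FP8 test can find them. A minimal, hypothetical sketch of how such an entry could be resolved (the real lookup lives in TensorRT-LLM's accuracy harness and may be implemented differently):

# Hypothetical illustration only: resolves a reference entry from the YAML
# structure shown in the diff. The actual accuracy harness in TensorRT-LLM
# may implement this lookup differently.
import yaml

REFERENCES = yaml.safe_load("""
meta-llama/Llama-3.3-70B-Instruct:
  - accuracy: 81.31
  - quant_algo: FP8
    spec_dec_algo: Eagle
    accuracy: 81.31
""")


def find_reference(references, model, quant_algo=None, spec_dec_algo=None):
    """Return the accuracy threshold of the first entry whose fields match."""
    for entry in references[model]:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("spec_dec_algo") == spec_dec_algo):
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} ({quant_algo}, {spec_dec_algo})")


# With the patched MMLU references, the FP8 + Eagle configuration resolves.
assert find_reference(REFERENCES, "meta-llama/Llama-3.3-70B-Instruct",
                      quant_algo="FP8", spec_dec_algo="Eagle") == 81.31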


@@ -383,25 +383,27 @@ class TestLlama3_3_70BInstruct(LlmapiAccuracyTestHarness):
         task.evaluate(llm,
                       extra_evaluator_kwargs=dict(apply_chat_template=True))
     @skip_pre_hopper
     @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("eagle3_one_model", [True, False])
-    def test_eagle3_tp8(self, eagle3_one_model):
-        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct"
+    def test_fp8_eagle3_tp8(self, eagle3_one_model):
+        model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8"
         eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.3-Instruct-70B"
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         spec_config = EagleDecodingConfig(max_draft_len=4,
                                           speculative_model_dir=eagle_model_dir,
                                           eagle3_one_model=eagle3_one_model)
-        pytorch_config = dict(disable_overlap_scheduler=True, )
+        pytorch_config = dict(
+            disable_overlap_scheduler=True,
+            cuda_graph_config=CudaGraphConfig(max_batch_size=1))
         with LLM(model_path,
                  max_batch_size=16,
                  tensor_parallel_size=8,
                  speculative_config=spec_config,
                  kv_cache_config=kv_cache_config,
                  **pytorch_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
     @pytest.mark.skip_less_device(4)
     @skip_pre_hopper
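
The OOM fix itself is contained in the hunk above: the renamed test loads the FP8-prequantized checkpoint instead of the unquantized one and limits CUDA graph capture to batch size 1, while keeping the KV cache pool at 60% of free GPU memory. A minimal standalone sketch of that configuration outside the pytest harness, assuming the same public llmapi classes the test file uses and with /models standing in for llm_models_root():

# Standalone sketch of the memory-conscious setup used by the fixed test.
# Assumes these classes are importable from tensorrt_llm.llmapi as in the
# test file; /models is a placeholder for the local model root.
from tensorrt_llm.llmapi import (LLM, CudaGraphConfig, EagleDecodingConfig,
                                 KvCacheConfig)

model_path = "/models/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8"
eagle_model_dir = "/models/EAGLE3-LLaMA3.3-Instruct-70B"

with LLM(model_path,
         max_batch_size=16,
         tensor_parallel_size=8,
         speculative_config=EagleDecodingConfig(
             max_draft_len=4,
             speculative_model_dir=eagle_model_dir,
             eagle3_one_model=True),
         # Reserve only 60% of free GPU memory for the KV cache pool.
         kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.6),
         disable_overlap_scheduler=True,
         # Capture CUDA graphs for batch size 1 only, bounding capture memory.
         cuda_graph_config=CudaGraphConfig(max_batch_size=1)) as llm:
    outputs = llm.generate(["Summarize the article: ..."])
    print(outputs[0].outputs[0].text)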


@@ -450,8 +450,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance]
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=True]
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=False]
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True]
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False]
 accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype


@@ -66,8 +66,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=True]
-accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=False]
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True]
+accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]