TensorRT-LLMs/tests/integration/defs/examples/test_pytorch.py
Enwei Zhu 705eef68c2
test: Accuracy test improvement (Part 2): Incorporate mmlu to accuracy test suite (#2982)
* Accuracy test improvement (Part 2)

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* WAR OOM

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

update

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* fix

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

* fix

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

---------

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
2025-03-25 07:34:10 +08:00

158 lines
5.2 KiB
Python

import os
import pytest
from defs.common import venv_check_call
from defs.conftest import llm_models_root, skip_pre_blackwell
@pytest.mark.parametrize("enable_fp4",
                         [pytest.param(True, marks=skip_pre_blackwell), False],
                         ids=["enable_fp4", "disable_fp4"])
@pytest.mark.parametrize("model_name", ["llama-3.1-8b"])
def test_llm_llama_1gpu(
    mmlu_dataset_root,
    enable_fp4,
    llama_example_root,
    model_name,
    llm_venv,
):
    """Run the MMLU accuracy check for Llama-3.1-8B on the PyTorch backend.

    Assembles the ``mmlu_llmapi.py`` command line (the script is looked up
    one directory above ``llama_example_root``) and passes it, together with
    ``llm_venv``, to ``venv_check_call``.

    Args:
        mmlu_dataset_root: Value forwarded to the script as ``--data_dir``.
        enable_fp4: When truthy, the checkpoint under ``nvfp4-quantized`` is
            used; otherwise the one under ``llama-3.1-model``.
        llama_example_root: Directory used to locate ``mmlu_llmapi.py``.
        model_name: Key into the accuracy-threshold table below.
        llm_venv: Environment object handed to ``venv_check_call``.
    """
    # NOTE(review): pass/fail is presumably signalled by venv_check_call
    # raising on a non-zero exit status -- confirm in defs.common.
    checkpoint_parent = "nvfp4-quantized" if enable_fp4 else "llama-3.1-model"
    checkpoint_dir = os.path.join(llm_models_root(), checkpoint_parent,
                                  "Meta-Llama-3.1-8B")

    print("Run MMLU test")

    # Threshold forwarded as --accuracy_threshold, keyed by model_name.
    min_accuracy = {'llama-3.1-8b': 61}[model_name]

    command = [f"{llama_example_root}/../mmlu_llmapi.py"]
    command.append(f"--data_dir={mmlu_dataset_root}")
    command.append(f"--hf_model_dir={checkpoint_dir}")
    command.extend([
        "--backend=pytorch",
        "--check_accuracy",
        "--enable_chunked_prefill",
    ])
    command.append(f"--accuracy_threshold={min_accuracy}")

    venv_check_call(llm_venv, command)
@pytest.mark.parametrize("enable_fp4",
                         [pytest.param(True, marks=skip_pre_blackwell), False],
                         ids=["enable_fp4", "disable_fp4"])
@pytest.mark.parametrize("enable_fp8", [
    pytest.param(True, marks=pytest.mark.skip_device_not_contain(["H100"])),
    False
],
                         ids=["enable_fp8", "disable_fp8"])
@pytest.mark.parametrize("model_name", ["deepseek-v3-lite"])
def test_llm_deepseek_1gpu(
    mmlu_dataset_root,
    enable_fp4,
    enable_fp8,
    llama_example_root,
    model_name,
    llm_venv,
):
    """Run the MMLU accuracy check for DeepSeek-V3-Lite on the PyTorch backend.

    Selects one of the ``DeepSeek-V3-Lite`` checkpoint variants, assembles the
    ``mmlu_llmapi.py`` command line (the script is looked up one directory
    above ``llama_example_root``) and passes it, together with ``llm_venv``,
    to ``venv_check_call``.

    Args:
        mmlu_dataset_root: Value forwarded to the script as ``--data_dir``.
        enable_fp4: When truthy, selects the ``nvfp4_moe_only`` variant. This
            flag is checked first, so it wins over ``enable_fp8``.
        enable_fp8: When truthy (and ``enable_fp4`` is falsy), selects the
            ``fp8`` variant. With both flags falsy ``bf16`` is used.
        llama_example_root: Directory used to locate ``mmlu_llmapi.py``.
        model_name: Key into the accuracy-threshold table below.
        llm_venv: Environment object handed to ``venv_check_call``.
    """
    checkpoints_root = llm_models_root()

    # Only the last path component differs between the precision variants.
    if enable_fp4:
        variant = "nvfp4_moe_only"
    elif enable_fp8:
        variant = "fp8"
    else:
        variant = "bf16"
    checkpoint_dir = os.path.join(checkpoints_root, "DeepSeek-V3-Lite",
                                  variant)

    print("Run MMLU test")

    # Threshold forwarded as --accuracy_threshold, keyed by model_name.
    min_accuracy = {'deepseek-v3-lite': 68}[model_name]

    command = [f"{llama_example_root}/../mmlu_llmapi.py"]
    command.append(f"--data_dir={mmlu_dataset_root}")
    command.append(f"--hf_model_dir={checkpoint_dir}")
    command.extend([
        "--backend=pytorch",
        "--check_accuracy",
        "--enable_overlap_scheduler",
        "--kv_cache_free_gpu_memory_fraction=0.8",
    ])
    command.append(f"--accuracy_threshold={min_accuracy}")

    venv_check_call(llm_venv, command)
# NOTE(review): the two skip_less_* markers are project-defined; presumably
# they skip the test on machines with too little GPU memory / fewer than 4
# GPUs -- confirm the exact semantics and units in the test conftest.
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("model_name,model_path", [
    pytest.param('Llama-3.3-70B-Instruct-fp8',
                 'modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8',
                 marks=pytest.mark.skip_device_not_contain(["B200", "H100"])),
    pytest.param('Llama-3.3-70B-Instruct-fp4',
                 'modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4',
                 marks=pytest.mark.skip_device_not_contain(["B200"])),
])
def test_mmlu_llmapi_4gpus(llm_venv, llama_example_root, mmlu_dataset_root,
                           model_name, model_path):
    """Run the MMLU accuracy check for Llama-3.3-70B-Instruct with tp_size=4.

    Assembles the ``mmlu_llmapi.py`` command line (the script is looked up
    one directory above ``llama_example_root``) and passes it, together with
    ``llm_venv``, to ``venv_check_call``.

    Args:
        llm_venv: Environment object handed to ``venv_check_call``.
        llama_example_root: Directory used to locate ``mmlu_llmapi.py``.
        mmlu_dataset_root: Value forwarded to the script as ``--data_dir``.
        model_name: Key into the accuracy-threshold table below; also
            printed in the progress message.
        model_path: Checkpoint location relative to ``llm_models_root()``.
    """
    models_root = llm_models_root()
    model_dir = os.path.join(models_root, model_path)

    print(f"Run MMLU test on {model_name}.")

    # Thresholds forwarded as --accuracy_threshold, one per checkpoint.
    accuracy_map = {
        'Llama-3.3-70B-Instruct-fp8': 80.4,
        'Llama-3.3-70B-Instruct-fp4': 78.5,
    }
    acc_thres = accuracy_map[model_name]

    mmlu_cmd = [
        f"{llama_example_root}/../mmlu_llmapi.py",
        f"--data_dir={mmlu_dataset_root}",
        f"--hf_model_dir={model_dir}",
        "--backend=pytorch",
        "--check_accuracy",
        "--enable_chunked_prefill",
        f"--accuracy_threshold={acc_thres}",
        # Plain literal: there is nothing to interpolate here.
        "--tp_size=4",
    ]
    venv_check_call(llm_venv, mmlu_cmd)
# NOTE(review): the two skip_less_* markers are project-defined; presumably
# they skip the test on machines with too little GPU memory / fewer than 2
# GPUs -- confirm the exact semantics and units in the test conftest.
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("model_name,model_path", [
    pytest.param('Mixtral-8x7B-Instruct-v0.1-fp8',
                 'modelopt-hf-model-hub/Mixtral-8x7B-Instruct-v0.1-fp8',
                 marks=pytest.mark.skip_device_not_contain(["B200", "H100"])),
    pytest.param('Mixtral-8x7B-Instruct-v0.1-fp4',
                 'modelopt-hf-model-hub/Mixtral-8x7B-Instruct-v0.1-fp4',
                 marks=pytest.mark.skip_device_not_contain(["B200"])),
])
def test_mmlu_llmapi_2gpus(llm_venv, llama_example_root, mmlu_dataset_root,
                           model_name, model_path):
    """Run the MMLU accuracy check for Mixtral-8x7B-Instruct with tp_size=2.

    Assembles the ``mmlu_llmapi.py`` command line (the script is looked up
    one directory above ``llama_example_root``) and passes it, together with
    ``llm_venv``, to ``venv_check_call``.

    Args:
        llm_venv: Environment object handed to ``venv_check_call``.
        llama_example_root: Directory used to locate ``mmlu_llmapi.py``.
        mmlu_dataset_root: Value forwarded to the script as ``--data_dir``.
        model_name: Key into the accuracy-threshold table below; also
            printed in the progress message.
        model_path: Checkpoint location relative to ``llm_models_root()``.
    """
    models_root = llm_models_root()
    model_dir = os.path.join(models_root, model_path)

    print(f"Run MMLU test on {model_name}.")

    # Thresholds forwarded as --accuracy_threshold, one per checkpoint.
    accuracy_map = {
        'Mixtral-8x7B-Instruct-v0.1-fp8': 67.9,
        'Mixtral-8x7B-Instruct-v0.1-fp4': 66.9,
    }
    acc_thres = accuracy_map[model_name]

    mmlu_cmd = [
        f"{llama_example_root}/../mmlu_llmapi.py",
        f"--data_dir={mmlu_dataset_root}",
        f"--hf_model_dir={model_dir}",
        "--backend=pytorch",
        "--check_accuracy",
        "--enable_chunked_prefill",
        f"--accuracy_threshold={acc_thres}",
        # Plain literal: there is nothing to interpolate here.
        "--tp_size=2",
    ]
    venv_check_call(llm_venv, mmlu_cmd)