mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
tests: add llama 3.3 70b 2 nodes tests (#4391)
* add llama 3.3 70b 2 nodes tests

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>

* remove enable_overlap_scheduler parameter

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>

---------

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>
This commit is contained in:
parent 6a35c599ef
commit 750f412b8f
@@ -1551,20 +1551,19 @@ def test_ptq_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
 
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.skip_less_device(8)
-@pytest.mark.parametrize("model_name,model_path", [
-    pytest.param('DeepSeek-V3', 'DeepSeek-V3', marks=skip_pre_hopper),
-])
+@skip_pre_hopper
+@skip_post_blackwell
+@pytest.mark.parametrize("model_path", ['DeepSeek-V3'])
 def test_ptp_quickstart_advanced_deepseek_v3_2nodes_8gpus(
-        llm_root, llm_venv, model_name, model_path):
+        llm_root, llm_venv, model_path):
     # "RCCA https://nvbugs/5163844"
-    print(f"Testing {model_name}.")
+    print(f"Testing {model_path}.")
     example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
     run_cmd = [
         "trtllm-llmapi-launch",
         "python3",
         str(example_root / "quickstart_advanced.py"),
-        "--model_dir",
-        f"{llm_models_root()}/{model_path}",
+        f"--model_dir={llm_models_root()}/{model_path}",
         "--moe_ep_size=8",
         "--tp_size=16",
         "--use_cuda_graph",
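Dropping the model_name parameter also shortens the generated pytest node ID: a two-value parametrize produces the suffix [DeepSeek-V3-DeepSeek-V3], while the single-value form produces [DeepSeek-V3], which is why the multinode test list further down is updated to match. A minimal standalone sketch of this ID behavior (the test names here are illustrative, not part of the commit):

import pytest


@pytest.mark.parametrize("model_name,model_path",
                         [("DeepSeek-V3", "DeepSeek-V3")])
def test_two_params(model_name, model_path):
    # pytest joins the two values with "-", so the node ID ends in
    # "[DeepSeek-V3-DeepSeek-V3]"
    pass


@pytest.mark.parametrize("model_path", ["DeepSeek-V3"])
def test_one_param(model_path):
    # a single value yields the shorter suffix "[DeepSeek-V3]"
    pass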
@@ -2063,4 +2062,30 @@ def test_ptp_scaffolding(llm_root, llm_venv, model_name, model_path):
 ])
 
 
+@pytest.mark.skip_less_device_memory(80000)
+@pytest.mark.skip_less_device(4)
+@pytest.mark.parametrize("model_path", [
+    pytest.param('llama-3.3-models/Llama-3.3-70B-Instruct',
+                 marks=skip_pre_hopper),
+    pytest.param('Llama-4-Maverick-17B-128E-Instruct', marks=skip_pre_hopper),
+])
+def test_ptp_quickstart_advanced_llama_2nodes(llm_root, llm_venv, model_path):
+    print(f"Testing {model_path}.")
+    example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
+    run_cmd = [
+        "trtllm-llmapi-launch",
+        "python3",
+        str(example_root / "quickstart_advanced.py"),
+        f"--model_dir={llm_models_root()}/{model_path}",
+        "--moe_ep_size=8",
+        "--tp_size=16",
+        "--use_cuda_graph",
+        f"--kv_cache_fraction={_MEM_FRACTION_50}",
+        "--max_batch_size=32",
+        "--max_num_tokens=2048",
+        "--disable_kv_cache_reuse",
+    ]
+    check_call(" ".join(run_cmd), shell=True, env=llm_venv._new_env)
+
+
 # End of Pivot-To-Python examples
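For reference, the command string that check_call receives is " ".join(run_cmd). A minimal sketch of what that expands to for the Llama-3.3 entry, assuming llm_models_root() resolves to a hypothetical MODELS_ROOT and that _MEM_FRACTION_50 equals 0.5 (neither value appears in this diff):

from pathlib import Path

MODELS_ROOT = "/path/to/llm-models"  # hypothetical stand-in for llm_models_root()
example_root = Path("examples/pytorch")
run_cmd = [
    "trtllm-llmapi-launch",
    "python3",
    str(example_root / "quickstart_advanced.py"),
    f"--model_dir={MODELS_ROOT}/llama-3.3-models/Llama-3.3-70B-Instruct",
    "--moe_ep_size=8",
    "--tp_size=16",
    "--use_cuda_graph",
    "--kv_cache_fraction=0.5",  # assumes _MEM_FRACTION_50 == 0.5
    "--max_batch_size=32",
    "--max_num_tokens=2048",
    "--disable_kv_cache_reuse",
]
print(" ".join(run_cmd))

The tp_size=16 setting matches the 2-node, 8-GPU-per-node layout these multinode tests target (2 x 8 = 16 ranks).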
@@ -2,5 +2,6 @@ examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp
 examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp8-tp16pp1-infer]
 examples/test_mixtral.py::test_llm_mixtral_2nodes_8gpus[Mixtral-8x22B-v0.1-plugin-renormalize-tensor_parallel-build]
 examples/test_mixtral.py::test_llm_mixtral_2nodes_8gpus[Mixtral-8x22B-v0.1-plugin-renormalize-tensor_parallel-infer]
-test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_2nodes_8gpus[DeepSeek-V3-DeepSeek-V3]
+test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_2nodes_8gpus[DeepSeek-V3]
+test_e2e.py::test_ptp_quickstart_advanced_llama_2nodes[llama-3.3-models/Llama-3.3-70B-Instruct]
 test_e2e.py::test_openai_multinodes_chat_tp16pp1
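Each entry in this list is a pytest node ID, so a single case can be selected directly. A minimal sketch, assuming it is run from the directory containing test_e2e.py on a suitably configured 2-node setup:

import pytest

# Select only the new Llama-3.3 multinode case by its node ID.
pytest.main([
    "test_e2e.py::test_ptp_quickstart_advanced_llama_2nodes"
    "[llama-3.3-models/Llama-3.3-70B-Instruct]",
])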